Newer
Older
137001
137002
137003
137004
137005
137006
137007
137008
137009
137010
137011
137012
137013
137014
137015
137016
137017
137018
137019
137020
137021
137022
137023
137024
137025
137026
137027
137028
137029
137030
137031
137032
137033
137034
137035
137036
137037
137038
137039
137040
137041
137042
137043
137044
137045
137046
137047
137048
137049
137050
137051
137052
137053
137054
137055
137056
137057
137058
137059
137060
137061
137062
137063
137064
137065
137066
137067
137068
137069
137070
137071
137072
137073
137074
137075
137076
137077
137078
137079
137080
137081
137082
137083
137084
137085
137086
137087
137088
137089
137090
137091
137092
137093
137094
137095
137096
137097
137098
137099
137100
137101
137102
137103
137104
137105
137106
137107
137108
137109
137110
137111
137112
137113
137114
137115
137116
137117
137118
137119
137120
137121
137122
137123
137124
137125
137126
137127
137128
137129
137130
137131
137132
137133
137134
137135
137136
137137
137138
137139
137140
137141
137142
137143
137144
137145
137146
137147
137148
137149
137150
137151
137152
137153
137154
137155
137156
137157
137158
137159
137160
137161
137162
137163
137164
137165
137166
137167
137168
137169
137170
137171
137172
137173
137174
137175
137176
137177
137178
137179
137180
137181
137182
137183
137184
137185
137186
137187
137188
137189
137190
137191
137192
137193
137194
137195
137196
137197
137198
137199
137200
137201
137202
137203
137204
137205
137206
137207
137208
137209
137210
137211
137212
137213
137214
137215
137216
137217
137218
137219
137220
137221
137222
137223
137224
137225
137226
137227
137228
137229
137230
137231
137232
137233
137234
137235
137236
137237
137238
137239
137240
137241
137242
137243
137244
137245
137246
137247
137248
137249
137250
137251
137252
137253
137254
137255
137256
137257
137258
137259
137260
137261
137262
137263
137264
137265
137266
137267
137268
137269
137270
137271
137272
137273
137274
137275
137276
137277
137278
137279
137280
137281
137282
137283
137284
137285
137286
137287
137288
137289
137290
137291
137292
137293
137294
137295
137296
137297
137298
137299
137300
137301
137302
137303
137304
137305
137306
137307
137308
137309
137310
137311
137312
137313
137314
137315
137316
137317
137318
137319
137320
137321
137322
137323
137324
137325
137326
137327
137328
137329
137330
137331
137332
137333
137334
137335
137336
137337
137338
137339
137340
137341
137342
137343
137344
137345
137346
137347
137348
137349
137350
137351
137352
137353
137354
137355
137356
137357
137358
137359
137360
137361
137362
137363
137364
137365
137366
137367
137368
137369
137370
137371
137372
137373
137374
137375
137376
137377
137378
137379
137380
137381
137382
137383
137384
137385
137386
137387
137388
137389
137390
137391
137392
137393
137394
137395
137396
137397
137398
137399
137400
137401
137402
137403
137404
137405
137406
137407
137408
137409
137410
137411
137412
137413
137414
if( sqlite3_user_data(p) ){
u_strToUpper(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
}else{
u_strToLower(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
}
if( !U_SUCCESS(status) ){
icuFunctionError(p, "u_strToLower()/u_strToUpper", status);
return;
}
sqlite3_result_text16(p, zOutput, -1, xFree);
}
/*
** Destructor for an ICU-backed collation sequence.
**
** pCtx is the UCollator handle (obtained from ucol_open()) that was
** registered together with the collation; it is released here with
** ucol_close().
*/
static void icuCollationDel(void *pCtx){
  ucol_close((UCollator *)pCtx);
}
/*
** Comparison callback for an ICU-backed collation sequence.
**
** pCtx is the UCollator handle (obtained from ucol_open()) registered
** with the collation.  zLeft and zRight are handed to ucol_strcoll() as
** UChar strings; the byte counts nLeft and nRight are halved to obtain
** the lengths passed alongside them.
**
** Returns -1, 0 or +1 when ucol_strcoll() reports UCOL_LESS, UCOL_EQUAL
** or UCOL_GREATER respectively.  Any other value trips the assert() and,
** in builds where assert() is disabled, is reported as 0.
*/
static int icuCollationColl(
  void *pCtx,
  int nLeft,
  const void *zLeft,
  int nRight,
  const void *zRight
){
  UCollator *pColl = (UCollator *)pCtx;
  UCollationResult eCmp = ucol_strcoll(
      pColl, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2
  );

  if( eCmp==UCOL_LESS ) return -1;
  if( eCmp==UCOL_GREATER ) return +1;
  if( eCmp==UCOL_EQUAL ) return 0;

  assert(!"Unexpected return value from ucol_strcoll()");
  return 0;
}
/*
** Implementation of the scalar SQL function icu_load_collation().
**
** The function registers an ICU-based collation sequence with the
** database connection stored as the function's user data.  Usage:
**
**     SELECT icu_load_collation(<locale>, <collation-name>);
**
** <locale> is an ICU locale identifier (for example "en_AU" or "tr_TR")
** and <collation-name> is the name under which the new collation
** sequence is registered.
**
** If either argument evaluates to NULL the function returns without
** doing anything.  A ucol_open() failure is reported through
** icuFunctionError().  If registering the collation fails, the collator
** is closed here and an SQL error is set on the context.
*/
static void icuLoadCollation(
  sqlite3_context *p,
  int nArg,
  sqlite3_value **apArg
){
  sqlite3 *pDb = (sqlite3 *)sqlite3_user_data(p);
  UErrorCode eStatus = U_ZERO_ERROR;
  const char *zLocale;      /* Locale identifier - (eg. "ja_JP") */
  const char *zName;        /* SQL collation sequence name (eg. "japanese") */
  UCollator *pColl;         /* ICU library collation object */

  assert(nArg==2);
  zLocale = (const char *)sqlite3_value_text(apArg[0]);
  zName = (const char *)sqlite3_value_text(apArg[1]);
  if( zLocale==0 || zName==0 ) return;

  pColl = ucol_open(zLocale, &eStatus);
  if( !U_SUCCESS(eStatus) ){
    icuFunctionError(p, "ucol_open", eStatus);
    return;
  }
  assert(p);

  /* The collator is passed as the collation's context pointer and
  ** icuCollationDel as its destructor.  On the failure path below the
  ** collator is closed explicitly rather than left to that destructor. */
  if( SQLITE_OK!=sqlite3_create_collation_v2(
          pDb, zName, SQLITE_UTF16, (void *)pColl,
          icuCollationColl, icuCollationDel)
  ){
    ucol_close(pColl);
    sqlite3_result_error(p, "Error registering collation function", -1);
  }
}
/*
** Register every ICU extension SQL function with database handle db.
**
** The functions are described by a local table and registered in table
** order with sqlite3_create_function().  Registration stops at the first
** call that does not return SQLITE_OK and that code is returned;
** SQLITE_OK is returned when every registration succeeds.
*/
SQLITE_PRIVATE int sqlite3IcuInit(sqlite3 *db){
  struct IcuScalar {
    const char *zName;                        /* Function name */
    int nArg;                                 /* Number of arguments */
    int enc;                                  /* Optimal text encoding */
    void *pContext;                           /* sqlite3_user_data() context */
    void (*xFunc)(sqlite3_context*,int,sqlite3_value**);
  } scalars[] = {
    {"regexp", 2, SQLITE_ANY, 0, icuRegexpFunc},

    {"lower", 1, SQLITE_UTF16, 0, icuCaseFunc16},
    {"lower", 2, SQLITE_UTF16, 0, icuCaseFunc16},
    {"upper", 1, SQLITE_UTF16, (void*)1, icuCaseFunc16},
    {"upper", 2, SQLITE_UTF16, (void*)1, icuCaseFunc16},

    {"lower", 1, SQLITE_UTF8, 0, icuCaseFunc16},
    {"lower", 2, SQLITE_UTF8, 0, icuCaseFunc16},
    {"upper", 1, SQLITE_UTF8, (void*)1, icuCaseFunc16},
    {"upper", 2, SQLITE_UTF8, (void*)1, icuCaseFunc16},

    {"like", 2, SQLITE_UTF8, 0, icuLikeFunc},
    {"like", 3, SQLITE_UTF8, 0, icuLikeFunc},

    {"icu_load_collation", 2, SQLITE_UTF8, (void*)db, icuLoadCollation},
  };
  const int nScalar = (int)(sizeof(scalars)/sizeof(scalars[0]));
  int rc = SQLITE_OK;
  int iScalar = 0;

  while( rc==SQLITE_OK && iScalar<nScalar ){
    struct IcuScalar *pDef = &scalars[iScalar];
    rc = sqlite3_create_function(
        db, pDef->zName, pDef->nArg, pDef->enc, pDef->pContext,
        pDef->xFunc, 0, 0
    );
    iScalar++;
  }

  return rc;
}
#if !SQLITE_CORE
/*
** Extension entry point. It is compiled only when SQLITE_CORE is
** undefined or zero.
**
** pApi is handed to the SQLITE_EXTENSION_INIT2() macro, after which the
** ICU SQL functions are registered on db by sqlite3IcuInit(). The return
** value is whatever sqlite3IcuInit() returns. pzErrMsg is not used by
** this function.
*/
SQLITE_API int sqlite3_extension_init(
sqlite3 *db,
char **pzErrMsg,
const sqlite3_api_routines *pApi
){
SQLITE_EXTENSION_INIT2(pApi)
return sqlite3IcuInit(db);
}
#endif
#endif
/************** End of icu.c *************************************************/
/************** Begin file fts3_icu.c ****************************************/
/*
** 2007 June 22
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This file implements a tokenizer for fts3 based on the ICU library.
*/
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
#ifdef SQLITE_ENABLE_ICU
/* #include <assert.h> */
/* #include <string.h> */
#include <unicode/ubrk.h>
/* #include <unicode/ucol.h> */
/* #include <unicode/ustring.h> */
#include <unicode/utf16.h>
typedef struct IcuTokenizer IcuTokenizer;
typedef struct IcuCursor IcuCursor;
/*
** Tokenizer object created by icuCreate() and released by icuDestroy().
**
** "base" must remain the first member: pointers to this structure are
** cast to and from (sqlite3_tokenizer *) by the tokenizer methods.
*/
struct IcuTokenizer {
sqlite3_tokenizer base;
/* Locale name passed to ubrk_open() by icuOpen(). icuCreate() points it
** at a copy of argv[0] stored directly after this structure in the same
** allocation, or leaves it NULL when no argument was supplied. */
char *zLocale;
};
/*
** Tokenization cursor created by icuOpen() and released by icuClose().
**
** "base" must remain the first member: pointers to this structure are
** cast to and from (sqlite3_tokenizer_cursor *) by the cursor methods.
** aChar[] and aOffset[] live in the same allocation as the structure
** itself (see icuOpen()), so they are never freed separately.
*/
struct IcuCursor {
sqlite3_tokenizer_cursor base;
UBreakIterator *pIter; /* ICU break-iterator object */
int nChar; /* Number of UChar elements stored in aChar[] */
UChar *aChar; /* Case-folded copy of input using utf-16 encoding */
/* aOffset[i] is the byte offset within the utf-8 input that corresponds
** to position i in aChar[]. icuNext() reads it at token boundaries to
** report start and end offsets. */
int *aOffset; /* Offsets of each character in utf-8 input */
int nBuffer; /* Size of zBuffer in bytes */
char *zBuffer; /* utf-8 text of the current token; owned by the cursor */
int iToken; /* Position value reported for the next token */
};
/*
** Create a new ICU tokenizer instance.
**
** When at least one argument is supplied, argv[0] is copied (including
** its nul-terminator) into space allocated directly after the
** IcuTokenizer structure and remembered as the tokenizer's locale.
** With no arguments the locale pointer is left NULL.
**
** Returns SQLITE_OK and writes the new object to *ppTokenizer, or
** returns SQLITE_NOMEM if the allocation fails (in which case
** *ppTokenizer is not written).
*/
static int icuCreate(
  int argc,                            /* Number of entries in argv[] */
  const char * const *argv,            /* Tokenizer creation arguments */
  sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
){
  IcuTokenizer *pNew;
  int nLocale = 0;                     /* Bytes in argv[0] including nul */

  if( argc>0 ) nLocale = strlen(argv[0])+1;

  pNew = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+nLocale);
  if( pNew==0 ) return SQLITE_NOMEM;
  memset(pNew, 0, sizeof(IcuTokenizer));

  if( nLocale!=0 ){
    /* The locale string is stored in the bytes that follow the struct */
    pNew->zLocale = (char *)&pNew[1];
    memcpy(pNew->zLocale, argv[0], nLocale);
  }

  *ppTokenizer = (sqlite3_tokenizer *)pNew;
  return SQLITE_OK;
}
/*
** Destroy a tokenizer created by icuCreate().
**
** The locale string shares the tokenizer's allocation, so a single
** sqlite3_free() releases everything.  Always returns SQLITE_OK.
*/
static int icuDestroy(sqlite3_tokenizer *pTokenizer){
  sqlite3_free((IcuTokenizer *)pTokenizer);
  return SQLITE_OK;
}
/*
** Prepare to begin tokenizing a particular string. The input
** string to be tokenized is zInput[0..nInput-1]. A cursor
** used to incrementally tokenize this string is returned in
** *ppCursor.
**
** A NULL zInput is treated as an empty string. A negative nInput means
** the length is obtained with strlen(zInput).
**
** The cursor structure, its utf-16 buffer (aChar[]) and its offset table
** (aOffset[]) are carved out of one sqlite3_malloc() allocation. The
** input is decoded one code point at a time with U8_NEXT, case-folded
** with u_foldCase(), and appended to aChar[] with U16_APPEND, while
** aOffset[] records the matching byte offsets in the input. A word
** break iterator is then opened over aChar[] using the tokenizer locale.
**
** Returns SQLITE_OK on success, SQLITE_NOMEM if the allocation fails, or
** SQLITE_ERROR if U16_APPEND flags an error or ubrk_open() does not
** succeed. *ppCursor is left set to 0 on every failure path.
*/
static int icuOpen(
sqlite3_tokenizer *pTokenizer, /* The tokenizer */
const char *zInput, /* Input string */
int nInput, /* Length of zInput in bytes, or negative to use strlen() */
sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
){
IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
IcuCursor *pCsr;
const int32_t opt = U_FOLD_CASE_DEFAULT;
UErrorCode status = U_ZERO_ERROR;
int nChar;
UChar32 c;
/* iInput is the read position in zInput (bytes); iOut is the write
** position in aChar[] (UChar units). Both are advanced by the ICU
** macros used below. */
int iInput = 0;
int iOut = 0;
*ppCursor = 0;
if( zInput==0 ){
nInput = 0;
zInput = "";
}else if( nInput<0 ){
nInput = strlen(zInput);
}
/* Capacity of aChar[] in UChar units. NOTE(review): this appears to
** assume the case-folded utf-16 form never needs more code units than
** the utf-8 input has bytes, plus one spare - confirm. */
nChar = nInput+1;
/* The aChar[] element count is rounded up to a multiple of 4,
** presumably so that the int array that follows it stays suitably
** aligned - TODO confirm. */
pCsr = (IcuCursor *)sqlite3_malloc(
sizeof(IcuCursor) + /* IcuCursor */
((nChar+3)&~3) * sizeof(UChar) + /* IcuCursor.aChar[] */
(nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */
);
if( !pCsr ){
return SQLITE_NOMEM;
}
memset(pCsr, 0, sizeof(IcuCursor));
/* aChar[] starts right after the structure; aOffset[] starts right
** after the (rounded-up) aChar[] region. */
pCsr->aChar = (UChar *)&pCsr[1];
pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3];
/* Offset of the first character: iOut and iInput are both 0 here. */
pCsr->aOffset[iOut] = iInput;
U8_NEXT(zInput, iInput, nInput, c);
/* Transcode until U8_NEXT yields a value <= 0 or the input is used up.
** NOTE(review): this means conversion stops at an embedded nul and at
** whatever U8_NEXT reports as non-positive for malformed input; the
** remainder of the input is then not tokenized - confirm against the
** ICU documentation for U8_NEXT. */
while( c>0 ){
int isError = 0;
c = u_foldCase(c, opt);
U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
if( isError ){
sqlite3_free(pCsr);
return SQLITE_ERROR;
}
/* iOut now indexes the slot for the next character and iInput is the
** byte offset just past the character consumed, so this records where
** the next character starts (or the end of input after the last one). */
pCsr->aOffset[iOut] = iInput;
if( iInput<nInput ){
U8_NEXT(zInput, iInput, nInput, c);
}else{
c = 0;
}
}
pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
if( !U_SUCCESS(status) ){
sqlite3_free(pCsr);
return SQLITE_ERROR;
}
pCsr->nChar = iOut;
ubrk_first(pCsr->pIter);
*ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
return SQLITE_OK;
}
/*
** Close a tokenization cursor previously opened by icuOpen().
**
** Releases the ICU break iterator, the token output buffer and finally
** the cursor allocation itself (which also holds the aChar[] and
** aOffset[] arrays).  Always returns SQLITE_OK.
*/
static int icuClose(sqlite3_tokenizer_cursor *pCursor){
  IcuCursor *pIcu = (IcuCursor *)pCursor;

  ubrk_close(pIcu->pIter);
  sqlite3_free(pIcu->zBuffer);
  sqlite3_free(pIcu);
  return SQLITE_OK;
}
/*
** Extract the next token from a tokenization cursor.
**
** The break iterator is advanced one segment at a time. Leading
** characters for which u_isspace() is true are trimmed from each
** segment, and segments that become empty as a result are skipped. The
** first non-empty segment is converted to utf-8 in the cursor's zBuffer
** and returned.
**
** Returns SQLITE_OK when a token was produced, SQLITE_DONE when the
** break iterator is exhausted, or SQLITE_NOMEM if the output buffer
** cannot be grown. The OUT parameters are written only when SQLITE_OK
** is returned. *ppToken points into the cursor's zBuffer, which is
** reused by later calls and freed by icuClose().
*/
static int icuNext(
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by icuOpen() */
const char **ppToken, /* OUT: *ppToken is the token text */
int *pnBytes, /* OUT: Number of bytes in token */
int *piStartOffset, /* OUT: Starting offset of token */
int *piEndOffset, /* OUT: Ending offset of token */
int *piPosition /* OUT: Position integer of token */
){
IcuCursor *pCsr = (IcuCursor *)pCursor;
/* iStart and iEnd are indexes into pCsr->aChar[]; starting them equal
** forces at least one pass through the segment-search loop. */
int iStart = 0;
int iEnd = 0;
int nByte = 0;
while( iStart==iEnd ){
UChar32 c;
iStart = ubrk_current(pCsr->pIter);
iEnd = ubrk_next(pCsr->pIter);
if( iEnd==UBRK_DONE ){
return SQLITE_DONE;
}
/* Trim leading whitespace. U16_NEXT advances iWhite past the character
** it reads, so iStart only moves forward when that character is a
** space. */
while( iStart<iEnd ){
int iWhite = iStart;
U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
if( u_isspace(c) ){
iStart = iWhite;
}else{
break;
}
}
assert(iStart<=iEnd);
}
/* Convert aChar[iStart..iEnd-1] to utf-8. On the first pass nByte is 0,
** so no reallocation happens and u_strToUTF8() is called with the
** buffer as it stands; the call stores the required length in nByte.
** While that length exceeds nBuffer the loop repeats, first growing
** zBuffer to exactly nByte bytes. */
do {
UErrorCode status = U_ZERO_ERROR;
if( nByte ){
char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
if( !zNew ){
return SQLITE_NOMEM;
}
pCsr->zBuffer = zNew;
pCsr->nBuffer = nByte;
}
/* NOTE(review): status is written by u_strToUTF8() but never inspected
** here, so a conversion failure other than "buffer too small" would go
** unreported - confirm this cannot occur for the contents of aChar[]. */
u_strToUTF8(
pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */
&pCsr->aChar[iStart], iEnd-iStart, /* Input vars */
&status /* Output success/failure */
);
} while( nByte>pCsr->nBuffer );
*ppToken = pCsr->zBuffer;
*pnBytes = nByte;
/* Map utf-16 indexes back to byte offsets in the original utf-8 input. */
*piStartOffset = pCsr->aOffset[iStart];
*piEndOffset = pCsr->aOffset[iEnd];
*piPosition = pCsr->iToken++;
return SQLITE_OK;
}
/*
** The set of routines that implement the ICU tokenizer
*/
static const sqlite3_tokenizer_module icuTokenizerModule = {
0, /* iVersion */
icuCreate, /* xCreate */
icuDestroy, /* xDestroy */
icuOpen, /* xOpen */
icuClose, /* xClose */
icuNext, /* xNext */
};
/*
** Set *ppModule to point at the implementation of the ICU tokenizer.
*/
SQLITE_PRIVATE void sqlite3Fts3IcuTokenizerModule(
sqlite3_tokenizer_module const**ppModule
){
*ppModule = &icuTokenizerModule;
}
#endif /* defined(SQLITE_ENABLE_ICU) */
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
/************** End of fts3_icu.c ********************************************/