2021SC@SDUSC
简介
本文介绍分词器部分的最后一个内容:fts3_expr.c。这个文件主要实现查询字符串(即 MATCH 运算符右侧表达式)的解析功能。MATCH 运算符用于全文检索,例如下面这两句:
SELECT title, body FROM pages WHERE pages MATCH 'world';
SELECT title, body FROM pages WHERE title MATCH 'world';
这两句中,前一个 MATCH 左边写的是表名,后一个写的是列名。后一个仅搜索 title 列,前一个搜索全部列(docid 列除外)。MATCH 右侧的表达式支持前缀查询(如 hel*)、指定列查询,以及 AND/OR/NEAR/NOT 等运算,例如:
SELECT title, body FROM pages WHERE pages MATCH 'hel*';
SELECT title, body FROM pages WHERE pages MATCH 'title:hello';
SELECT title, body FROM pages WHERE pages MATCH 'hello AND world';
SELECT title, body FROM pages WHERE pages MATCH '(hello NEAR world) OR (program AND language)';
源码分析
fts3_expr.c 文件中包含下列几个函数:
/*
** Return 1 if character c is one of the six ASCII whitespace characters
** (space, tab, newline, carriage return, vertical tab, form feed), or 0
** otherwise.
*/
static int fts3isspace(char c){
  switch( c ){
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\v':
    case '\f':
      return 1;
    default:
      return 0;
  }
}
/*
** Allocate nByte bytes with sqlite3_malloc64() and fill them with zeros.
** Returns the new buffer, or NULL if the allocation failed.
*/
static void *fts3MallocZero(sqlite3_int64 nByte){
  void *pNew = sqlite3_malloc64(nByte);
  if( pNew==0 ){
    return 0;
  }
  memset(pNew, 0, nByte);
  return pNew;
}
/*
** Open a tokenizer cursor over the n bytes of text at z using tokenizer
** pTokenizer, and configure it for language id iLangid.
**
** The cursor is created with the module's xOpen() method. If the module
** reports iVersion>=1, its xLanguageid() method is then invoked; should
** that call fail, the cursor is closed again.
**
** *ppCsr is always written: it receives the new cursor on success and
** NULL when xLanguageid() fails. The return value is the result code of
** the last tokenizer method that was called.
*/
int sqlite3Fts3OpenTokenizer(
  sqlite3_tokenizer *pTokenizer,
  int iLangid,
  const char *z,
  int n,
  sqlite3_tokenizer_cursor **ppCsr
){
  sqlite3_tokenizer_module const *pMod = pTokenizer->pModule;
  sqlite3_tokenizer_cursor *pNew = 0;
  int rc = pMod->xOpen(pTokenizer, z, n, &pNew);

  /* A failing xOpen() is expected to leave the cursor pointer NULL. */
  assert( rc==SQLITE_OK || pNew==0 );
  if( rc!=SQLITE_OK ){
    *ppCsr = pNew;
    return rc;
  }

  pNew->pTokenizer = pTokenizer;
  if( pMod->iVersion>=1 ){
    rc = pMod->xLanguageid(pNew, iLangid);
    if( rc!=SQLITE_OK ){
      /* Language id rejected: do not hand a half-configured cursor out. */
      pMod->xClose(pNew);
      pNew = 0;
    }
  }

  *ppCsr = pNew;
  return rc;
}
/*
** Extract the next token from the n bytes of query text at z and wrap it
** in a single-token FTSQUERY_PHRASE expression.
**
**   pParse      Parse context; supplies the tokenizer, language id and
**               the bFts4 flag. pParse->isNot is set when the token is
**               preceded by '-' and parenthesis syntax is disabled.
**   iCol        Value stored in Fts3Phrase.iColumn of the new phrase.
**   z, n        Input text.
**   ppExpr      OUT: the new expression, or NULL if none was created.
**   pnConsumed  OUT: number of input bytes consumed.
**
** Only the prefix of z that precedes the first '"' (or the first
** parenthesis, when sqlite3_fts3_enable_parentheses is set) is passed to
** the tokenizer. A '*' immediately after the token marks it as a prefix
** token and is consumed as well. A '^' immediately before the token sets
** bFirst when pParse->bFts4 is set.
**
** Returns the tokenizer result code, except that SQLITE_DONE (no token in
** a non-empty prefix) is reported as SQLITE_OK with *ppExpr set to NULL.
** Returns SQLITE_NOMEM if the expression cannot be allocated.
*/
static int getNextToken(
  ParseContext *pParse,
  int iCol,
  const char *z, int n,
  Fts3Expr **ppExpr,
  int *pnConsumed
){
  sqlite3_tokenizer *pTok = pParse->pTokenizer;
  sqlite3_tokenizer_module const *pMod = pTok->pModule;
  sqlite3_tokenizer_cursor *pCsr;
  Fts3Expr *pNew = 0;
  const char *zTerm;
  int nTerm = 0, iFrom = 0, iTo = 0, iPos = 0;
  int nScan = 0;
  int rc;

  /* Find the number of leading bytes that may be handed to the tokenizer. */
  while( nScan<n ){
    if( sqlite3_fts3_enable_parentheses
     && (z[nScan]=='(' || z[nScan]==')')
    ){
      break;
    }
    if( z[nScan]=='"' ) break;
    nScan++;
  }

  *pnConsumed = nScan;
  rc = sqlite3Fts3OpenTokenizer(pTok, pParse->iLangid, z, nScan, &pCsr);
  if( rc!=SQLITE_OK ){
    *ppExpr = pNew;
    return rc;
  }

  rc = pMod->xNext(pCsr, &zTerm, &nTerm, &iFrom, &iTo, &iPos);
  if( rc==SQLITE_OK ){
    /* Expression, phrase and token text share one allocation. */
    sqlite3_int64 nAlloc = sizeof(Fts3Expr) + sizeof(Fts3Phrase) + nTerm;
    pNew = (Fts3Expr *)fts3MallocZero(nAlloc);
    if( pNew==0 ){
      rc = SQLITE_NOMEM;
    }else{
      Fts3Phrase *pPhrase = (Fts3Phrase *)&pNew[1];

      pNew->eType = FTSQUERY_PHRASE;
      pNew->pPhrase = pPhrase;
      pPhrase->nToken = 1;
      pPhrase->iColumn = iCol;
      pPhrase->aToken[0].n = nTerm;
      pPhrase->aToken[0].z = (char *)&pPhrase[1];
      memcpy(pPhrase->aToken[0].z, zTerm, nTerm);

      /* A trailing '*' turns the token into a prefix query. */
      if( iTo<n && z[iTo]=='*' ){
        pPhrase->aToken[0].isPrefix = 1;
        iTo++;
      }

      /* Walk backwards over any '-' / '^' qualifiers before the token. */
      while( iFrom>0 ){
        if( !sqlite3_fts3_enable_parentheses && z[iFrom-1]=='-' ){
          pParse->isNot = 1;
        }else if( pParse->bFts4 && z[iFrom-1]=='^' ){
          pPhrase->aToken[0].bFirst = 1;
        }else{
          break;
        }
        iFrom--;
      }
    }
    *pnConsumed = iTo;
  }else if( rc==SQLITE_DONE && nScan ){
    /* No token in a non-empty prefix: not an error. */
    rc = SQLITE_OK;
  }

  pMod->xClose(pCsr);
  *ppExpr = pNew;
  return rc;
}
/*
** Tokenize the nInput bytes of quoted-phrase text at zInput and build a
** single FTSQUERY_PHRASE expression holding every token.
**
**   pParse   Parse context; supplies the tokenizer, language id and the
**            default column (iDefaultCol) stored in the phrase.
**   zInput   Phrase text.
**   nInput   Size of zInput in bytes.
**   ppExpr   OUT: the new expression on success, NULL on any failure.
**
** The result is one allocation laid out as: Fts3Expr, Fts3Phrase, the
** array of Fts3PhraseToken structures, then the token text. It is built
** in two passes:
**
**   Pass 1 iterates the tokenizer cursor, growing buffer p (header space
**          plus token array; only the token array is populated) and
**          buffer zTemp (concatenated token text).
**
**   Pass 2 runs once the tokenizer reports SQLITE_DONE: it grows p to its
**          final size, zeroes and fills in the header, appends zTemp and
**          points each token at its text.
**
** Returns SQLITE_OK on success, SQLITE_NOMEM if an allocation fails, or
** the tokenizer's error code. On every non-OK return both working
** buffers are released, so a tokenizer error reported by xNext() part-way
** through the phrase no longer leaks zTemp or hands the caller a
** partially initialized expression.
*/
static int getNextString(
  ParseContext *pParse,
  const char *zInput, int nInput,
  Fts3Expr **ppExpr
){
  sqlite3_tokenizer *pTokenizer = pParse->pTokenizer;
  sqlite3_tokenizer_module const *pModule = pTokenizer->pModule;
  int rc;
  Fts3Expr *p = 0;
  sqlite3_tokenizer_cursor *pCursor = 0;
  char *zTemp = 0;
  int nTemp = 0;
  const int nSpace = sizeof(Fts3Expr) + sizeof(Fts3Phrase);
  int nToken = 0;

  rc = sqlite3Fts3OpenTokenizer(
      pTokenizer, pParse->iLangid, zInput, nInput, &pCursor);
  if( rc==SQLITE_OK ){
    int ii;
    for(ii=0; rc==SQLITE_OK; ii++){
      const char *zByte;
      int nByte = 0, iBegin = 0, iEnd = 0, iPos = 0;
      rc = pModule->xNext(pCursor, &zByte, &nByte, &iBegin, &iEnd, &iPos);
      if( rc==SQLITE_OK ){
        Fts3PhraseToken *pToken;

        /* fts3ReallocOrFree() presumably releases the old buffer when it
        ** fails (as its name suggests), so the pointer may simply be
        ** overwritten with the result. */
        p = fts3ReallocOrFree(p, nSpace + ii*sizeof(Fts3PhraseToken));
        if( !p ){
          rc = SQLITE_NOMEM;
          goto phrase_out;
        }
        zTemp = fts3ReallocOrFree(zTemp, nTemp + nByte);
        if( !zTemp ){
          rc = SQLITE_NOMEM;
          goto phrase_out;
        }

        assert( nToken==ii );
        pToken = &((Fts3Phrase *)(&p[1]))->aToken[ii];
        memset(pToken, 0, sizeof(Fts3PhraseToken));

        memcpy(&zTemp[nTemp], zByte, nByte);
        nTemp += nByte;

        pToken->n = nByte;
        /* '*' right after the token => prefix; '^' right before => first. */
        pToken->isPrefix = (iEnd<nInput && zInput[iEnd]=='*');
        pToken->bFirst = (iBegin>0 && zInput[iBegin-1]=='^');
        nToken = ii+1;
      }
    }

    pModule->xClose(pCursor);
    pCursor = 0;
  }

  if( rc==SQLITE_DONE ){
    int jj;
    char *zBuf = 0;

    p = fts3ReallocOrFree(p, nSpace + nToken*sizeof(Fts3PhraseToken) + nTemp);
    if( !p ){
      rc = SQLITE_NOMEM;
      goto phrase_out;
    }
    /* Zero everything in front of the token array (the two headers). */
    memset(p, 0, (char *)&(((Fts3Phrase *)&p[1])->aToken[0])-(char *)p);
    p->eType = FTSQUERY_PHRASE;
    p->pPhrase = (Fts3Phrase *)&p[1];
    p->pPhrase->iColumn = pParse->iDefaultCol;
    p->pPhrase->nToken = nToken;

    /* Token text lives immediately after the last token structure. */
    zBuf = (char *)&p->pPhrase->aToken[nToken];
    if( zTemp ){
      memcpy(zBuf, zTemp, nTemp);
    }else{
      assert( nTemp==0 );
    }

    for(jj=0; jj<p->pPhrase->nToken; jj++){
      p->pPhrase->aToken[jj].z = zBuf;
      zBuf += p->pPhrase->aToken[jj].n;
    }
    rc = SQLITE_OK;
  }

phrase_out:
  /* Single exit path: the cursor is still open only if pass 1 bailed out. */
  if( pCursor ){
    pModule->xClose(pCursor);
  }
  sqlite3_free(zTemp);
  if( rc!=SQLITE_OK ){
    sqlite3_free(p);
    p = 0;
  }
  *ppExpr = p;
  return rc;
}
/*
** Free the expression tree rooted at pDel. pDel may be NULL, and must be
** the root of its tree (pParent==0).
**
** The tree is released iteratively rather than recursively: starting at
** the first leaf (preferring left children on the way down), each node is
** passed to fts3FreeExprNode() only after both of its subtrees, using the
** pParent links to climb back up.
*/
void sqlite3Fts3ExprFree(Fts3Expr *pDel){
  Fts3Expr *pNode = pDel;

  assert( pDel==0 || pDel->pParent==0 );

  /* Descend to the first leaf. */
  while( pNode && (pNode->pLeft || pNode->pRight) ){
    assert( pNode->pParent==0
         || pNode==pNode->pParent->pRight
         || pNode==pNode->pParent->pLeft );
    if( pNode->pLeft ){
      pNode = pNode->pLeft;
    }else{
      pNode = pNode->pRight;
    }
  }

  while( pNode ){
    Fts3Expr *pUp = pNode->pParent;
    fts3FreeExprNode(pNode);

    if( pUp==0 || pNode!=pUp->pLeft || pUp->pRight==0 ){
      /* Root, a right child, or an only child: the parent is next. */
      pNode = pUp;
      continue;
    }

    /* A left child with a sibling: visit the sibling's subtree first. */
    pNode = pUp->pRight;
    while( pNode && (pNode->pLeft || pNode->pRight) ){
      assert( pNode==pNode->pParent->pRight || pNode==pNode->pParent->pLeft );
      if( pNode->pLeft ){
        pNode = pNode->pLeft;
      }else{
        pNode = pNode->pRight;
      }
    }
  }
}
解析
默认情况下,此模块解析 fts3 传统上使用的遗留(旧)语法。如果定义了 SQLITE_ENABLE_FTS3_PARENTHESIS,则改用新语法。新旧语法之间的区别是:
a)新的语法支持括号。旧的没有。
b)新语法支持 AND 和 NOT 运算符。旧的没有。
c)旧的语法支持“-”令牌限定符。新的语法不支持这一点(它将被NOT操作符替换)。
d)使用旧语法时,OR 运算符的优先级高于隐式 AND。使用新语法时,隐式 AND 和显式 AND 运算符的优先级都高于 OR。
下面以类似 lemon 解析器生成器所用的格式,描述 fts3 MATCH 运算符支持的语法。这个模块实际上并不使用 lemon,而是使用一个自定义的解析器。
query ::= andexpr (OR andexpr)*.
andexpr ::= notexpr (AND? notexpr)*.
notexpr ::= nearexpr (NOT nearexpr|-TOKEN)*.
notexpr ::= LP query RP.
nearexpr ::= phrase (NEAR distance_opt nearexpr)*.
distance_opt ::= .
distance_opt ::= / INTEGER.
phrase ::= TOKEN.
phrase ::= COLUMN:TOKEN.
phrase ::= "TOKEN TOKEN TOKEN...".
|