329 lines
9.7 KiB
C++
329 lines
9.7 KiB
C++
/*-------------------------------------------------------------------------
|
|
*
|
|
* unaccent.c
|
|
* Text search unaccent dictionary
|
|
*
|
|
* Copyright (c) 2009-2012, PostgreSQL Global Development Group
|
|
*
|
|
* IDENTIFICATION
|
|
* contrib/unaccent/unaccent.c
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
|
|
#include "postgres.h"
|
|
#include "knl/knl_variable.h"
|
|
|
|
#include "catalog/namespace.h"
|
|
#include "commands/defrem.h"
|
|
#include "tsearch/ts_cache.h"
|
|
#include "tsearch/ts_locale.h"
|
|
#include "tsearch/ts_public.h"
|
|
#include "utils/builtins.h"
|
|
|
|
PG_MODULE_MAGIC;
|
|
|
|
/*
 * The unaccent dictionary uses an uncompressed suffix tree to find the
 * character to replace. Each node of the tree is an array of 256
 * SuffixChar structs; the n-th element of the array corresponds to
 * byte value n.
 */
|
|
typedef struct SuffixChar SuffixChar;

struct SuffixChar {
    SuffixChar* nextChar; /* child node (array of SuffixChar) for the next byte, or NULL */
    char* replaceTo;      /* replacement bytes for a sequence ending here, or NULL */
    int replacelen;       /* number of bytes in replaceTo */
};
|
|
|
|
/*
|
|
* placeChar - put str into tree's structure, byte by byte.
|
|
*/
|
|
static SuffixChar* placeChar(SuffixChar* node, unsigned char* str, int lenstr, char* replaceTo, int replacelen)
|
|
{
|
|
SuffixChar* curnode = NULL;
|
|
errno_t rc;
|
|
|
|
if (!node) {
|
|
node = (SuffixChar*)palloc(sizeof(SuffixChar) * 256);
|
|
rc = memset_s(node, sizeof(SuffixChar) * 256, 0, sizeof(SuffixChar) * 256);
|
|
securec_check_c(rc, "", "");
|
|
}
|
|
|
|
curnode = node + *str;
|
|
|
|
if (lenstr == 1) {
|
|
if (curnode->replaceTo)
|
|
elog(WARNING, "duplicate TO argument, use first one");
|
|
else {
|
|
curnode->replacelen = replacelen;
|
|
curnode->replaceTo = (char*)palloc(replacelen);
|
|
rc = memcpy_s(curnode->replaceTo, replacelen, replaceTo, replacelen);
|
|
securec_check_c(rc, "", "");
|
|
}
|
|
} else {
|
|
curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
|
|
}
|
|
|
|
return node;
|
|
}
|
|
|
|
/*
|
|
* initSuffixTree - create suffix tree from file. Function converts
|
|
* UTF8-encoded file into current encoding.
|
|
*/
|
|
static SuffixChar* initSuffixTree(char* filename)
|
|
{
|
|
SuffixChar* volatile rootSuffixTree = NULL;
|
|
MemoryContext ccxt = CurrentMemoryContext;
|
|
tsearch_readline_state trst;
|
|
volatile bool skip = false;
|
|
|
|
filename = get_tsearch_config_filename(filename, "rules");
|
|
if (!tsearch_readline_begin(&trst, filename))
|
|
ereport(
|
|
ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("could not open unaccent file \"%s\": %m", filename)));
|
|
|
|
do {
|
|
/*
|
|
* pg_do_encoding_conversion() (called by tsearch_readline()) will
|
|
* emit exception if it finds untranslatable characters in current
|
|
* locale. We just skip such lines, continuing with the next.
|
|
*/
|
|
skip = true;
|
|
|
|
PG_TRY();
|
|
{
|
|
char* line = NULL;
|
|
|
|
while ((line = tsearch_readline(&trst)) != NULL) {
|
|
/*
|
|
* The format of each line must be "src trg" where src and trg
|
|
* are sequences of one or more non-whitespace characters,
|
|
* separated by whitespace. Whitespace at start or end of
|
|
* line is ignored.
|
|
*/
|
|
int state;
|
|
char* ptr = NULL;
|
|
char* src = NULL;
|
|
char* trg = NULL;
|
|
int ptrlen;
|
|
int srclen = 0;
|
|
int trglen = 0;
|
|
|
|
state = 0;
|
|
for (ptr = line; *ptr; ptr += ptrlen) {
|
|
ptrlen = pg_mblen(ptr);
|
|
/* ignore whitespace, but end src or trg */
|
|
if (t_isspace(ptr)) {
|
|
if (state == 1)
|
|
state = 2;
|
|
else if (state == 3)
|
|
state = 4;
|
|
continue;
|
|
}
|
|
switch (state) {
|
|
case 0:
|
|
/* start of src */
|
|
src = ptr;
|
|
srclen = ptrlen;
|
|
state = 1;
|
|
break;
|
|
case 1:
|
|
/* continue src */
|
|
srclen += ptrlen;
|
|
break;
|
|
case 2:
|
|
/* start of trg */
|
|
trg = ptr;
|
|
trglen = ptrlen;
|
|
state = 3;
|
|
break;
|
|
case 3:
|
|
/* continue trg */
|
|
trglen += ptrlen;
|
|
break;
|
|
default:
|
|
/* bogus line format */
|
|
state = -1;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (state >= 3)
|
|
rootSuffixTree = placeChar(rootSuffixTree, (unsigned char*)src, srclen, trg, trglen);
|
|
|
|
pfree(line);
|
|
}
|
|
skip = false;
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
ErrorData* errdata = NULL;
|
|
MemoryContext ecxt;
|
|
|
|
ecxt = MemoryContextSwitchTo(ccxt);
|
|
errdata = CopyErrorData();
|
|
if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER) {
|
|
FlushErrorState();
|
|
} else {
|
|
MemoryContextSwitchTo(ecxt);
|
|
PG_RE_THROW();
|
|
}
|
|
}
|
|
PG_END_TRY();
|
|
} while (skip);
|
|
|
|
tsearch_readline_end(&trst);
|
|
|
|
return rootSuffixTree;
|
|
}
|
|
|
|
/*
|
|
* findReplaceTo - find multibyte character in tree
|
|
*/
|
|
static SuffixChar* findReplaceTo(SuffixChar* node, unsigned char* src, int srclen)
|
|
{
|
|
while (node) {
|
|
node = node + *src;
|
|
if (srclen == 1)
|
|
return node;
|
|
|
|
src++;
|
|
srclen--;
|
|
node = node->nextChar;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
PG_FUNCTION_INFO_V1(unaccent_init);
|
|
extern "C" Datum unaccent_init(PG_FUNCTION_ARGS);
|
|
Datum unaccent_init(PG_FUNCTION_ARGS)
|
|
{
|
|
List* dictoptions = (List*)PG_GETARG_POINTER(0);
|
|
SuffixChar* rootSuffixTree = NULL;
|
|
bool fileloaded = false;
|
|
ListCell* l = NULL;
|
|
|
|
foreach (l, dictoptions) {
|
|
DefElem* defel = (DefElem*)lfirst(l);
|
|
|
|
if (pg_strcasecmp("Rules", defel->defname) == 0) {
|
|
if (fileloaded)
|
|
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple Rules parameters")));
|
|
rootSuffixTree = initSuffixTree(defGetString(defel));
|
|
fileloaded = true;
|
|
} else {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("unrecognized Unaccent parameter: \"%s\"", defel->defname)));
|
|
}
|
|
}
|
|
|
|
if (!fileloaded) {
|
|
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("missing Rules parameter")));
|
|
}
|
|
|
|
PG_RETURN_POINTER(rootSuffixTree);
|
|
}
|
|
|
|
PG_FUNCTION_INFO_V1(unaccent_lexize);
|
|
extern "C" Datum unaccent_lexize(PG_FUNCTION_ARGS);
|
|
Datum unaccent_lexize(PG_FUNCTION_ARGS)
|
|
{
|
|
SuffixChar* rootSuffixTree = (SuffixChar*)PG_GETARG_POINTER(0);
|
|
char* srcchar = (char*)PG_GETARG_POINTER(1);
|
|
int32 len = PG_GETARG_INT32(2);
|
|
char* srcstart = NULL;
|
|
char* trgchar = NULL;
|
|
int charlen;
|
|
Size tarlen = 0;
|
|
TSLexeme* res = NULL;
|
|
SuffixChar* node = NULL;
|
|
errno_t rc;
|
|
|
|
srcstart = srcchar;
|
|
while (srcchar - srcstart < len) {
|
|
charlen = pg_mblen(srcchar);
|
|
|
|
node = findReplaceTo(rootSuffixTree, (unsigned char*)srcchar, charlen);
|
|
if (node && node->replaceTo) {
|
|
if (!res) {
|
|
/* allocate res only if it's needed */
|
|
res = (TSLexeme*)palloc0(sizeof(TSLexeme) * 2);
|
|
tarlen = (Size)len * pg_database_encoding_max_length() + 1; /* \0 */
|
|
res->lexeme = trgchar = (char*)palloc(tarlen);
|
|
res->flags = TSL_FILTER;
|
|
if (srcchar != srcstart) {
|
|
rc = memcpy_s(trgchar, tarlen, srcstart, srcchar - srcstart);
|
|
securec_check_c(rc, "", "");
|
|
trgchar += (srcchar - srcstart);
|
|
tarlen -= (srcchar - srcstart);
|
|
}
|
|
}
|
|
rc = memcpy_s(trgchar, node->replaceTo, node->replacelen);
|
|
|
|
trgchar += node->replacelen;
|
|
} else if (res) {
|
|
rc = memcpy_s(trgchar, charlen, srcchar, charlen);
|
|
securec_check_c(rc, "", "");
|
|
trgchar += charlen;
|
|
tarlen -= charlen;
|
|
}
|
|
|
|
srcchar += charlen;
|
|
}
|
|
|
|
if (res)
|
|
*trgchar = '\0';
|
|
|
|
PG_RETURN_POINTER(res);
|
|
}
|
|
|
|
/*
|
|
* Function-like wrapper for dictionary
|
|
*/
|
|
PG_FUNCTION_INFO_V1(unaccent_dict);
|
|
extern "C" Datum unaccent_dict(PG_FUNCTION_ARGS);
|
|
Datum unaccent_dict(PG_FUNCTION_ARGS)
|
|
{
|
|
text* str = NULL;
|
|
int strArg;
|
|
Oid dictOid;
|
|
TSDictionaryCacheEntry* dict = NULL;
|
|
TSLexeme* res = NULL;
|
|
|
|
if (PG_NARGS() == 1) {
|
|
dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
|
|
strArg = 0;
|
|
} else {
|
|
dictOid = PG_GETARG_OID(0);
|
|
strArg = 1;
|
|
}
|
|
str = PG_GETARG_TEXT_P(strArg);
|
|
|
|
dict = lookup_ts_dictionary_cache(dictOid);
|
|
|
|
res = (TSLexeme*)DatumGetPointer(FunctionCall4(&(dict->lexize),
|
|
PointerGetDatum(dict->dictData),
|
|
PointerGetDatum(VARDATA(str)),
|
|
Int32GetDatum(VARSIZE(str) - VARHDRSZ),
|
|
PointerGetDatum(NULL)));
|
|
|
|
PG_FREE_IF_COPY(str, strArg);
|
|
|
|
if (res == NULL) {
|
|
PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
|
|
} else if (res->lexeme == NULL) {
|
|
pfree(res);
|
|
PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
|
|
} else {
|
|
text* txt = cstring_to_text(res->lexeme);
|
|
|
|
pfree(res->lexeme);
|
|
pfree(res);
|
|
|
|
PG_RETURN_TEXT_P(txt);
|
|
}
|
|
}
|