openGauss-server/contrib/unaccent/unaccent.cpp

/*-------------------------------------------------------------------------
 *
 * unaccent.c
 *	  Text search unaccent dictionary
 *
 * Copyright (c) 2009-2012, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  contrib/unaccent/unaccent.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"
#include "knl/knl_variable.h"

#include "catalog/namespace.h"
#include "commands/defrem.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "utils/builtins.h"

PG_MODULE_MAGIC;

/*
 * Unaccent dictionary uses uncompressed suffix tree to find a
 * character to replace. Each node of tree is an array of
 * SuffixChar struct with length = 256 (n-th element of array
 * corresponds to byte)
 */
typedef struct SuffixChar {
    struct SuffixChar* nextChar;
    char* replaceTo;
    int replacelen;
} SuffixChar;

/*
 * placeChar - put str into tree's structure, byte by byte.
 */
static SuffixChar* placeChar(SuffixChar* node, unsigned char* str, int lenstr, char* replaceTo, int replacelen)
{
    SuffixChar* curnode = NULL;
    errno_t rc;

    if (!node) {
        node = (SuffixChar*)palloc(sizeof(SuffixChar) * 256);
        rc = memset_s(node, sizeof(SuffixChar) * 256, 0, sizeof(SuffixChar) * 256);
        securec_check_c(rc, "", "");
    }

    curnode = node + *str;

    if (lenstr == 1) {
        if (curnode->replaceTo)
            elog(WARNING, "duplicate TO argument, use first one");
        else {
            curnode->replacelen = replacelen;
            curnode->replaceTo = (char*)palloc(replacelen);
            rc = memcpy_s(curnode->replaceTo, replacelen, replaceTo, replacelen);
            securec_check_c(rc, "", "");
        }
    } else {
        curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
    }

    return node;
}

/*
 * initSuffixTree  - create suffix tree from file. Function converts
 * UTF8-encoded file into current encoding.
 */
static SuffixChar* initSuffixTree(char* filename)
{
    SuffixChar* volatile rootSuffixTree = NULL;
    MemoryContext ccxt = CurrentMemoryContext;
    tsearch_readline_state trst;
    volatile bool skip = false;

    filename = get_tsearch_config_filename(filename, "rules");
    if (!tsearch_readline_begin(&trst, filename))
        ereport(
            ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("could not open unaccent file \"%s\": %m", filename)));

    do {
        /*
         * pg_do_encoding_conversion() (called by tsearch_readline()) will
         * emit exception if it finds untranslatable characters in current
         * locale. We just skip such lines, continuing with the next.
         */
        skip = true;

        PG_TRY();
        {
            char* line = NULL;

            while ((line = tsearch_readline(&trst)) != NULL) {
                /*
                 * The format of each line must be "src trg" where src and trg
                 * are sequences of one or more non-whitespace characters,
                 * separated by whitespace.  Whitespace at start or end of
                 * line is ignored.
                 */
                int state;
                char* ptr = NULL;
                char* src = NULL;
                char* trg = NULL;
                int ptrlen;
                int srclen = 0;
                int trglen = 0;

                state = 0;
                for (ptr = line; *ptr; ptr += ptrlen) {
                    ptrlen = pg_mblen(ptr);
                    /* ignore whitespace, but end src or trg */
                    if (t_isspace(ptr)) {
                        if (state == 1)
                            state = 2;
                        else if (state == 3)
                            state = 4;
                        continue;
                    }
                    switch (state) {
                        case 0:
                            /* start of src */
                            src = ptr;
                            srclen = ptrlen;
                            state = 1;
                            break;
                        case 1:
                            /* continue src */
                            srclen += ptrlen;
                            break;
                        case 2:
                            /* start of trg */
                            trg = ptr;
                            trglen = ptrlen;
                            state = 3;
                            break;
                        case 3:
                            /* continue trg */
                            trglen += ptrlen;
                            break;
                        default:
                            /* bogus line format */
                            state = -1;
                            break;
                    }
                }

                if (state >= 3)
                    rootSuffixTree = placeChar(rootSuffixTree, (unsigned char*)src, srclen, trg, trglen);

                pfree(line);
            }
            skip = false;
        }
        PG_CATCH();
        {
            ErrorData* errdata = NULL;
            MemoryContext ecxt;

            ecxt = MemoryContextSwitchTo(ccxt);
            errdata = CopyErrorData();
            if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER) {
                FlushErrorState();
            } else {
                MemoryContextSwitchTo(ecxt);
                PG_RE_THROW();
            }
        }
        PG_END_TRY();
    } while (skip);

    tsearch_readline_end(&trst);

    return rootSuffixTree;
}

/*
 * findReplaceTo - find multibyte character in tree
 */
static SuffixChar* findReplaceTo(SuffixChar* node, unsigned char* src, int srclen)
{
    while (node) {
        node = node + *src;
        if (srclen == 1)
            return node;

        src++;
        srclen--;
        node = node->nextChar;
    }

    return NULL;
}

PG_FUNCTION_INFO_V1(unaccent_init);
extern "C" Datum unaccent_init(PG_FUNCTION_ARGS);
Datum unaccent_init(PG_FUNCTION_ARGS)
{
    List* dictoptions = (List*)PG_GETARG_POINTER(0);
    SuffixChar* rootSuffixTree = NULL;
    bool fileloaded = false;
    ListCell* l = NULL;

    foreach (l, dictoptions) {
        DefElem* defel = (DefElem*)lfirst(l);

        if (pg_strcasecmp("Rules", defel->defname) == 0) {
            if (fileloaded)
                ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple Rules parameters")));
            rootSuffixTree = initSuffixTree(defGetString(defel));
            fileloaded = true;
        } else {
            ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                    errmsg("unrecognized Unaccent parameter: \"%s\"", defel->defname)));
        }
    }

    if (!fileloaded) {
        ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("missing Rules parameter")));
    }

    PG_RETURN_POINTER(rootSuffixTree);
}

PG_FUNCTION_INFO_V1(unaccent_lexize);
extern "C" Datum unaccent_lexize(PG_FUNCTION_ARGS);
Datum unaccent_lexize(PG_FUNCTION_ARGS)
{
    SuffixChar* rootSuffixTree = (SuffixChar*)PG_GETARG_POINTER(0);
    char* srcchar = (char*)PG_GETARG_POINTER(1);
    int32 len = PG_GETARG_INT32(2);
    char* srcstart = NULL;
    char* trgchar = NULL;
    int charlen;
    Size tarlen = 0;
    TSLexeme* res = NULL;
    SuffixChar* node = NULL;
    errno_t rc;

    srcstart = srcchar;
    while (srcchar - srcstart < len) {
        charlen = pg_mblen(srcchar);

        node = findReplaceTo(rootSuffixTree, (unsigned char*)srcchar, charlen);
        if (node && node->replaceTo) {
            if (!res) {
                /* allocate res only if it's needed */
                res = (TSLexeme*)palloc0(sizeof(TSLexeme) * 2);
                tarlen = (Size)len * pg_database_encoding_max_length() + 1; /* \0 */
                res->lexeme = trgchar = (char*)palloc(tarlen);
                res->flags = TSL_FILTER;
                if (srcchar != srcstart) {
                    rc = memcpy_s(trgchar, tarlen, srcstart, srcchar - srcstart);
                    securec_check_c(rc, "", "");
                    trgchar += (srcchar - srcstart);
                    tarlen -= (srcchar - srcstart);
                }
            }
            rc = memcpy_s(trgchar, node->replaceTo, node->replacelen);

            trgchar += node->replacelen;
        } else if (res) {
            rc = memcpy_s(trgchar, charlen, srcchar, charlen);
            securec_check_c(rc, "", "");
            trgchar += charlen;
            tarlen -= charlen;
        }

        srcchar += charlen;
    }

    if (res)
        *trgchar = '\0';

    PG_RETURN_POINTER(res);
}

/*
 * Function-like wrapper for dictionary
 */
PG_FUNCTION_INFO_V1(unaccent_dict);
extern "C" Datum unaccent_dict(PG_FUNCTION_ARGS);
Datum unaccent_dict(PG_FUNCTION_ARGS)
{
    text* str = NULL;
    int strArg;
    Oid dictOid;
    TSDictionaryCacheEntry* dict = NULL;
    TSLexeme* res = NULL;

    if (PG_NARGS() == 1) {
        dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
        strArg = 0;
    } else {
        dictOid = PG_GETARG_OID(0);
        strArg = 1;
    }
    str = PG_GETARG_TEXT_P(strArg);

    dict = lookup_ts_dictionary_cache(dictOid);

    res = (TSLexeme*)DatumGetPointer(FunctionCall4(&(dict->lexize),
        PointerGetDatum(dict->dictData),
        PointerGetDatum(VARDATA(str)),
        Int32GetDatum(VARSIZE(str) - VARHDRSZ),
        PointerGetDatum(NULL)));

    PG_FREE_IF_COPY(str, strArg);

    if (res == NULL) {
        PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
    } else if (res->lexeme == NULL) {
        pfree(res);
        PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
    } else {
        text* txt = cstring_to_text(res->lexeme);

        pfree(res->lexeme);
        pfree(res);

        PG_RETURN_TEXT_P(txt);
    }
}