Skip to content

Commit

Permalink
Support PostgreSQL 12 to 16; fix bug that won't break Korean Hangul i…
Browse files Browse the repository at this point in the history
…nto 2-gram (#6)

* Support PostgreSQL 12 to 16; fix bug that won't break Korean Hangul into 2-gram

Close #4
Close #3

Co-authored-by: Jimmy Huang <[email protected]>
  • Loading branch information
jimmy-shaojun and huangjimmy authored Oct 26, 2023
1 parent 77e7ec6 commit a627574
Show file tree
Hide file tree
Showing 8 changed files with 269 additions and 57 deletions.
24 changes: 0 additions & 24 deletions .github/workflows/postgress12.yml

This file was deleted.

65 changes: 65 additions & 0 deletions .github/workflows/postgress12_16.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# CI: build the pg_cjk_parser development images for PostgreSQL 12 and 16 and
# run the matching verification script against each freshly built image.
name: Build pg_cjk_parser for postgres 12 and 16

on:
  push:
    branches:
      - "master"
  pull_request:
    branches:
      - "master"

jobs:
  # --- PostgreSQL 12 ---------------------------------------------------------
  build_pg12:
    runs-on: ubuntu-latest
    services:
      # Throw-away local registry that the built image is pushed to.
      registry:
        image: registry:2
        ports:
          - 5000:5000
    steps:
      - uses: actions/checkout@v4
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
        with:
          # presumably so the builder can reach the registry service at localhost:5000
          driver-opts: network=host
      - name: Build and push
        uses: docker/build-push-action@v5
        with:
          push: true
          file: Dockerfile_pg12
          tags: localhost:5000/postgres:12-dev
      # Pull the image back from the local registry, retag it as postgres:12-dev,
      # then run the repository's verification script.
      - name: Run bash script to verify image postgres:12-dev
        run: docker pull localhost:5000/postgres:12-dev && docker tag localhost:5000/postgres:12-dev postgres:12-dev && chmod +x ./postgres-12.sh && ./postgres-12.sh

  # --- PostgreSQL 16 ---------------------------------------------------------
  build_pg16:
    runs-on: ubuntu-latest
    services:
      # Throw-away local registry that the built image is pushed to.
      registry:
        image: registry:2
        ports:
          - 5000:5000
    steps:
      - uses: actions/checkout@v4
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
        with:
          # presumably so the builder can reach the registry service at localhost:5000
          driver-opts: network=host
      - name: Build and push
        uses: docker/build-push-action@v5
        with:
          push: true
          file: Dockerfile_pg16
          tags: localhost:5000/postgres:16-dev
      # Pull the image back from the local registry, retag it as postgres:16-dev,
      # then run the repository's verification script.
      - name: Run bash script to verify image postgres:16-dev
        run: docker pull localhost:5000/postgres:16-dev && docker tag localhost:5000/postgres:16-dev postgres:16-dev && chmod +x ./postgres-16.sh && ./postgres-16.sh

6 changes: 3 additions & 3 deletions Dockerfile → Dockerfile_pg12
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
FROM postgres:11
FROM postgres:12
RUN apt-get update
RUN apt-get install -y postgresql-server-dev-all
RUN apt-get install -y gcc
RUN apt-get install -y postgresql-server-dev-12
RUN apt-get install -y gcc make
RUN apt-get install -y icu-devtools libicu-dev

RUN mkdir -p /root/parser
Expand Down
14 changes: 14 additions & 0 deletions Dockerfile_pg16
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Development image: postgres:16 with the pg_cjk_parser extension compiled
# from source and installed into the server.
FROM postgres:16

# Refresh the package index and install the build dependencies in ONE layer.
# With `apt-get update` in its own RUN, Docker can reuse a cached (stale)
# index layer while re-running the install steps, which then fail or pull
# outdated packages. The index is removed afterwards to keep the layer small.
RUN apt-get update \
    && apt-get install -y \
        postgresql-server-dev-16 \
        gcc \
        make \
        icu-devtools \
        libicu-dev \
    && rm -rf /var/lib/apt/lists/*

# WORKDIR creates the directory when it does not exist, so no mkdir is needed.
WORKDIR /root/parser

# Sources required to build and install the extension.
COPY pg_cjk_parser.c \
     pg_cjk_parser.control \
     Makefile \
     pg_cjk_parser--0.0.1.sql \
     zht2zhs.h \
     /root/parser/

RUN make clean && make install
45 changes: 34 additions & 11 deletions Readme.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Postgres CJK Parser - pg_cjk_parser

Postgres CJK Parser pg_cjk_parser is a fts (full text search) parser derived from the default parser in PostgreSQL 11. When a postgres database uses utf-8 encoding, this parser supports all the features of the default parser while splitting CJK (Chinese, Japanese, Korean) characters into 2-gram tokens. If the database's encoding is not utf-8, the parser behaves just like the default parser.
Postgres CJK Parser pg_cjk_parser is a fts (full text search) parser derived from the default parser in PostgreSQL. When a postgres database uses utf-8 encoding, this parser supports all the features of the default parser while splitting CJK (Chinese, Japanese, Korean) characters into 2-gram tokens. If the database's encoding is not utf-8, the parser behaves just like the default parser.

Now pg_cjk_parser supports PostgreSQL 12 to 16.

## Introduction

Expand Down Expand Up @@ -59,16 +61,22 @@ You can build pg_cjk_parser in a docker container.

1. Clone this repository into your local computer, say in /home/user/pg_cjk_parser
2. Enter /home/user/pg_cjk_parser
3. Build the docker image postgres:11-dev
3. Build the docker image postgres:12-dev

To build this extension for PostgreSQL 12
```bash
docker build -t postgres:12-dev . -f Dockerfile_pg12
```

To build this extension for PostgreSQL 16
```bash
docker build -t postgres:11-dev .
docker build -t postgres:16-dev . -f Dockerfile_pg16
```

4. Run the following command

```bash
docker run -it --rm -v $(PWD):/root/code postgres:11-dev /bin/bash -c "cd /root/code && make clean && make"
docker run -it --rm -v $(pwd):/root/code postgres:12-dev /bin/bash -c "cd /root/code && make clean && make"
```

Then pg_cjk_parser.bc and pg_cjk_parser.so will be available in the current directory (/home/user/pg_cjk_parser). You can manually install the parser to a PostgreSQL instance or you can install it as an extension.
Expand All @@ -79,11 +87,12 @@ You can manually install pg_cjk_parser or you can install it as an extension.

### Install as an extension

Let's say that you have an instance of PostgreSQL 11 running, either on a docker container on a server.
Let's say that you have an instance of PostgreSQL 12 running, either in a Docker container or on a server.
Make sure you have the following dependencies installed.

```bash
sudo apt-get install -y postgresql-server-dev-all
# replace 12 with 16 if you build this extension for pg 16
sudo apt-get install -y postgresql-server-dev-12
sudo apt-get install -y gcc
sudo apt-get install -y icu-devtools libicu-dev
```
Expand All @@ -103,6 +112,7 @@ Run the following command on the server
```bash
cd /home/user/parser
make clean && make install
sudo make USE_PGXS=1 install
```

Connect to your server via pgAdmin or other clients and then execute the following sql to create the pg_cjk_parser extension.
Expand Down Expand Up @@ -132,13 +142,18 @@ Now you can execute the sql demonstrated in the introduction section to see the

### Docker image

There is a Dockerfile in this repository which helps you build a docker image based on postgres:11.
There is a Dockerfile in this repository which helps you build a docker image based on postgres:12.

```bash
docker build -t postgres:11-dev .
docker build -t postgres:12-dev . -f Dockerfile_pg12
```

If you use this image to start an instance of postgres:11, all you need to do is to create the extension, search parser and configuration in pgAdmin.
There is also a Dockerfile in this repository which helps you build a docker image based on postgres:16.
```bash
docker build -t postgres:16-dev . -f Dockerfile_pg16
```

If you use this image to start an instance of postgres:12, all you need to do is to create the extension, search parser and configuration in pgAdmin.

Connect to your server via pgAdmin or other clients and then execute the following sql to create the pg_cjk_parser extension.

Expand Down Expand Up @@ -167,10 +182,10 @@ Now you can execute the sql demonstrated in the introduction section to see the

### Install manually

Suppose you have an docker instance of postgres name postgres_db_1 whose image is postgres:11.
Suppose you have a docker instance of postgres named postgres_db_1 whose image is postgres:12.

```bash
docker cp pg_cjk_parser.so postgres_db_1:/usr/lib/postgresql/11/lib/
docker cp pg_cjk_parser.so postgres_db_1:/usr/lib/postgresql/12/lib/
```

Connect to the postgres instance via pgAdmin or other clients and then execute the following sql
Expand Down Expand Up @@ -335,6 +350,14 @@ to_tsvector('Doraemnon Nobita「ドラえもん のび太の牧場物語」多
|-|-|-|-|-|
|'doraemnon':1 'nobita':2 'χψψωω':22 '「':3 '」':15 'えも':6 'のび':8 'の牧':11 'び太':9 'もん':7 'ドラ':4 'ラえ':5 '場物':13 '多拉':16 '大雄':21 '太の':10 '梦':18 '比大':20 '牧場':12 '物語':14 '野比':19|"'のび' & 'び太'"|"'野比' & '比大' & '大雄'"|true|true|

```sql
SELECT to_tsvector('大韩民国개인정보의 수집 및 이용 목적(「개인정보 보호법」 제15조)'), to_tsquery('「大韩民国개인정보');
```

|to_tsvector|to_tsquery|
|-|-|
| '15':21 '「':13 '」':19 '国개':4 '大韩':1 '民国':3 '韩民':2 '개인':5,14 '목적':12 '및':10 '보의':8 '보호':17 '수집':9 '이용':11 '인정':6,15 '정보':7,16 '제':20 '조':22 '호법':18|'「' & '大韩' & '韩民' & '民国' & '国개' & '개인' & '인정' & '정보'|

## License

### PG CJK Parser
Expand Down
56 changes: 37 additions & 19 deletions pg_cjk_parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -613,7 +613,7 @@ p_isnotCJK(TParser *prs){
else
c = (pg_wchar) *(prs->wstr + prs->state->poschar);

if (c >= 0x2E80 && c <= 0x9FFF){
if ((c >= 0x2E80 && c <= 0x9FFF) || (c >= 0xAC00 && c <= 0xD7A3)){
return 0;
}
for(int i=0; i<7; i++){
Expand Down Expand Up @@ -647,7 +647,7 @@ p_isCJK(TParser *prs){
c = (pg_wchar) *(prs->wstr + prs->state->poschar);


if (c >= 0x2E80 && c <= 0x9FFF){
if ((c >= 0x2E80 && c <= 0x9FFF) || (c >= 0xAC00 && c <= 0xD7A3)){
#ifdef WPARSER_TRACE
fprintf(stderr, "%x isCJK?", c); fprintf(stderr, " = true\n");
#endif
Expand Down Expand Up @@ -682,7 +682,7 @@ p_isCJK2gram(TParser *prs){
else
c = (pg_wchar) *(prs->wstr + prs->state->poschar);

if (c >= 0x3040 && c <= 0x9FFF){
if ((c >= 0x3040 && c <= 0x9FFF) || (c >= 0xAC00 && c <= 0xD7A3)){
//CJK Unified Ideographs
//a 2-gram token
return 1;
Expand Down Expand Up @@ -714,7 +714,7 @@ utf8_cjkCodePoint(char * s){
static void
utf8_setCjkCodePoint(char * s, unsigned int codePoint){

if(codePoint >= 0x2E80 && codePoint <= 0x9FFF){
if((codePoint >= 0x2E80 && codePoint <= 0x9FFF) || (codePoint >= 0xAC00 && codePoint <= 0xD7A3)){
s[0] = 0xE0 | (codePoint>>12);
s[1] = 0x80 | ((codePoint>>6) & 0x3F);
s[2] = 0x80 | (codePoint & 0x3F);
Expand Down Expand Up @@ -763,7 +763,7 @@ p_isCJK2gram_twice(TParser *prs){
return 0;
}

if (c >= 0x3040 && c <= 0x9FFF){
if ((c >= 0x3040 && c <= 0x9FFF) || (c >= 0xAC00 && c <= 0xD7A3)){
//CJK Unified Ideographs
//token as if it is a 2-gram
pg_wchar nc;
Expand All @@ -772,7 +772,7 @@ p_isCJK2gram_twice(TParser *prs){
else
nc = (pg_wchar) *(prs->wstr + prs->state->poschar);

if (nc >= 0x3040 && nc <= 0x9FFF){
if ((nc >= 0x3040 && nc <= 0x9FFF) || (nc >= 0xAC00 && nc <= 0xD7A3)){
#ifdef WPARSER_TRACE
fprintf(stderr, " %x %x is 2-gram state=", c, nc);
fprintf(stderr, "%d \n", prs->state->state);
Expand Down Expand Up @@ -873,7 +873,7 @@ p_isCJKunigram(TParser *prs){
fprintf(stderr, "p_isCJKunigram: current char = %x\n", c);
#endif

if (c >= 0x3040 && c <= 0x9FFF){
if ((c >= 0x3040 && c <= 0x9FFF) || (c >= 0xAC00 && c <= 0xD7A3)){
//CJK Unified Ideographs
//if it is surrounded by non-CJK chars or CJK unigrams,
//it is also unigram
Expand All @@ -886,12 +886,12 @@ p_isCJKunigram(TParser *prs){
#ifdef WPARSER_TRACE
fprintf(stderr, "p_isCJKunigram: next char = %x\n", c);
#endif
if(c < 0x3040|| c > 0x9FFF){
if( !((c >= 0x3040 && c <= 0x9FFF) || (c >= 0xAC00 && c <= 0xD7A3)) ){
c = p_prevChar(prs);
#ifdef WPARSER_TRACE
fprintf(stderr, "p_isCJKunigram: prev char = %x\n", c);
#endif
if(c < 0x3040 || c > 0x9FFF)return 1;
if( !((c >= 0x3040 && c <= 0x9FFF) || (c >= 0xAC00 && c <= 0xD7A3)) )return 1;
}
return 0;
}
Expand Down Expand Up @@ -2341,19 +2341,37 @@ typedef struct
int len;
} hlCheck;

static bool
/*
 * The version-specific code below keys off PG_VERSION_NUM; refuse to build
 * when it is not defined rather than silently picking one variant.
 */
#ifndef PG_VERSION_NUM
#error "Cannot determine which postgresql version to build against"
#endif

/*
 * Compatibility shim for servers older than PostgreSQL 13
 * (PG_VERSION_NUM < 130000): map TSTernaryValue / TS_YES / TS_NO onto plain
 * bool / true / false so checkcondition_HL can be written once using the
 * TSTernaryValue spelling.
 * NOTE(review): presumably the pre-13 server headers do not provide these
 * names and expect a bool-returning callback -- confirm against tsearch
 * headers for each supported version.
 */
#if PG_VERSION_NUM < 130000
#define TSTernaryValue bool
#define TS_YES true
#define TS_NO false
#endif

/*
* TS_execute callback for matching a tsquery operand to headline words
*
* Note: it's tempting to report words[] indexes as pos values to save
* searching in hlCover; but that would screw up phrase matching, which
* expects to measure distances in lexemes not tokens.
*/
static TSTernaryValue
checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
{
int i;
hlCheck *checkval = (hlCheck *) opaque;
int i;

/* scan words array for matching items */
for (i = 0; i < checkval->len; i++)
{
if (checkval->words[i].item == val)
{
/* don't need to find all positions */
/* if data == NULL, don't need to report positions */
if (!data)
return true;
return TS_YES;

if (!data->pos)
{
Expand All @@ -2370,9 +2388,9 @@ checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
}

if (data && data->npos > 0)
return true;
return TS_YES;

return false;
return TS_NO;
}


Expand Down Expand Up @@ -2869,13 +2887,13 @@ prsd2_headline(PG_FUNCTION_ARGS)
char *val = defGetString(defel);

if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
max_words = pg_atoi(val, sizeof(int32), 0);
max_words = pg_strtoint32(val);
else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
min_words = pg_atoi(val, sizeof(int32), 0);
min_words = pg_strtoint32(val);
else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
shortword = pg_atoi(val, sizeof(int32), 0);
shortword = pg_strtoint32(val);
else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
max_fragments = pg_atoi(val, sizeof(int32), 0);
max_fragments = pg_strtoint32(val);
else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
prs->startsel = pstrdup(val);
else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
Expand Down
Loading

0 comments on commit a627574

Please sign in to comment.