Skip to content

Commit

Permalink
commit
Browse files Browse the repository at this point in the history
  • Loading branch information
taishan1994 committed Aug 17, 2022
1 parent 186d85c commit 76e0d6d
Show file tree
Hide file tree
Showing 7 changed files with 76,526 additions and 0 deletions.
41 changes: 41 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,47 @@ data_name是数据集的名字,text_repeat是每条文本生成文本的数量

除了数据量不一样,其余的参数均设置为一致。

# 补充分词实例

数据来源:链接: https://pan.baidu.com/s/1gvtqpjz05BglTy597AqbKQ?pwd=xuvp 提取码: xuvp 。具体实验过程参考其它数据集说明。

```shell
python main.py \
--bert_dir="../model_hub/chinese-bert-wwm-ext/" \
--data_dir="./data/sighan2005/" \
--data_name='sighan2005' \
--log_dir="./logs/" \
--output_dir="./checkpoints/" \
--num_tags=5 \
--seed=123 \
--gpu_ids="0" \
--max_seq_len=512 \
--lr=3e-5 \
--crf_lr=3e-2 \
--other_lr=3e-4 \
--train_batch_size=16 \
--train_epochs=3 \
--eval_batch_size=16 \
--lstm_hidden=128 \
--num_layers=1 \
--use_lstm='False' \
--use_crf='True' \
--dropout_prob=0.3 \
--dropout=0.3

precision:0.9667 recall:0.9549 micro_f1:0.9608
precision recall f1-score support

word 0.97 0.95 0.96 104371

micro-f1 0.97 0.95 0.96 104371

在1998年来临之际,我十分高兴地通过中央人民广播电台、中国国际广播电台和中央电视台,向全国各族人民,向香港特别行政区同胞、澳门和台湾同胞、海外侨胞,向世界各国的朋友们,致以诚挚的问候和良好的祝愿!
Load ckpt from ./checkpoints/bert_crf_sighan2005/model.pt
Use single gpu in: ['0']
{'word': [('', 0), ('1998年', 1), ('来临', 6), ('之际', 8), ('', 10), ('', 11), ('十分', 12), ('高兴', 14), ('', 16), ('通过', 17), ('中央', 19), ('人民', 21), ('广播', 23), ('电台', 25), ('', 27), ('中国', 28), ('国际', 30), ('广播', 32), ('电台', 34), ('', 36), ('中央', 37), ('电视台', 39), ('', 42), ('', 43), ('全国', 44), ('各族', 46), ('人民', 48), ('', 50), ('', 51), ('香港', 52), ('特别', 54), ('行政区', 56), ('同胞', 59), ('', 61), ('澳门', 62), ('', 64), ('台湾', 65), ('同胞', 67), ('', 69), ('海外', 70), ('侨胞', 72), ('', 74), ('', 75), ('世界各国', 76), ('', 80), ('朋友', 81), ('', 83), ('', 84), ('致以', 85), ('诚挚', 87), ('', 89), ('问候', 90), ('', 92), ('良好', 93), ('', 95), ('祝愿', 96), ('', 98)]}
```

# 补充商品标题要素抽取实例

数据来源:[商品标题](https://www.heywhale.com/mw/dataset/6241349d93e61600170895e5/file),就一个train.txt,初始格式为BIO。具体实验过程参考其它数据集说明。这里并没有运行完3个epoch,在720步手动终止了。类别数据进行了脱敏,要知道每类是什么意思,只有自己根据数据自己总结了=,=。
Expand Down
108 changes: 108 additions & 0 deletions data/sighan2005/raw_data/process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import json
from pprint import pprint  # kept from original file (unused here)


def build_records(lines, dtype="word", max_len=510):
    """Split whitespace-tokenized lines into labelled records of at most max_len chars.

    Each element of *lines* is one sentence whose words are separated by single
    spaces (SIGHAN 2005 segmentation format). Consecutive words are concatenated
    into a record until adding one word would push the character length past
    *max_len*; the accumulated prefix is then flushed as its own record and the
    current word starts a new one.

    Args:
        lines: iterable of raw text lines (trailing newlines are stripped).
        dtype: entity type written into every label (default "word").
        max_len: maximum character length of one record's text.
            Default 510 = 512 - 1 (0-based index) - 2 (CLS/SEP) + 1 (exclusive end).

    Returns:
        List of dicts {"id": int, "text": str,
                       "labels": [[tag, dtype, start, end, word], ...]}
        where start/end are character offsets into "text" (end exclusive)
        and tags restart at "T0" for every record.
    """
    records = []
    rec_id = 0
    for line in lines:
        words = line.strip().split(" ")
        chunk = []    # words accumulated for the current record
        labels = []
        tag_idx = 0
        for word in words:
            start = len("".join(chunk))
            chunk.append(word)
            end = start + len(word)
            labels.append(["T{}".format(tag_idx), dtype, start, end, word])
            tag_idx += 1
            if end > max_len:
                # Flush everything accumulated before the current word.
                sub_chunk = chunk[:-1]
                if sub_chunk:  # guard: a single over-long word must not emit an empty record
                    records.append({
                        "id": rec_id,
                        "text": "".join(sub_chunk),
                        "labels": labels[:-1],
                    })
                    rec_id += 1
                # Restart the record with the current word; tag ids restart at T0.
                chunk = [word]
                labels = [["T0", dtype, 0, len(word), word]]
                tag_idx = 1  # BUGFIX: original kept the old counter, yielding T0, Tk, Tk+1...
        if chunk:  # flush whatever remains of this line
            records.append({
                "id": rec_id,
                "text": "".join(chunk),
                "labels": labels,
            })
            rec_id += 1
    return records


def _convert(src_path, dst_path, max_len):
    """Read *src_path*, build records, and dump them as JSON to *dst_path*."""
    with open(src_path, "r", encoding="utf-8") as fp:
        lines = fp.readlines()
    with open(dst_path, "w", encoding="utf-8") as fp:
        json.dump(build_records(lines, max_len=max_len), fp, ensure_ascii=False)


def main():
    """Convert SIGHAN 2005 train/test files to mid_data JSON plus label maps."""
    max_seq_len = 512
    # -1: indices run 0-511; -2: drop CLS and SEP; +1: word end offset is exclusive.
    max_len = max_seq_len - 1 - 2 + 1

    _convert("training.txt", "../mid_data/train.json", max_len)

    with open("../mid_data/labels.json", "w", encoding="utf-8") as fp:
        json.dump(["word"], fp, ensure_ascii=False)

    # BIOES tag-to-id map for the single "word" entity type.
    nor_ent2id = {"O": 0, "B-word": 1, "I-word": 2, "E-word": 3, "S-word": 4}
    with open("../mid_data/nor_ent2id.json", "w", encoding="utf-8") as fp:
        json.dump(nor_ent2id, fp, ensure_ascii=False)

    _convert("test.txt", "../mid_data/test.json", max_len)


if __name__ == "__main__":
    main()
Loading

0 comments on commit 76e0d6d

Please sign in to comment.