Skip to content

Commit

Permalink
commit
Browse files Browse the repository at this point in the history
  • Loading branch information
taishan1994 committed Aug 17, 2022
1 parent 186d85c commit 76e0d6d
Show file tree
Hide file tree
Showing 7 changed files with 76,526 additions and 0 deletions.
41 changes: 41 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,47 @@ data_name是数据集的名字,text_repeat是每条文本生成文本的数量

除了数据量不一样,其余的参数均设置为一致。

# 补充分词实例

数据来源:链接: https://pan.baidu.com/s/1gvtqpjz05BglTy597AqbKQ?pwd=xuvp 提取码: xuvp 。具体实验过程参考其它数据集说明。

```shell
python main.py \
--bert_dir="../model_hub/chinese-bert-wwm-ext/" \
--data_dir="./data/sighan2005/" \
--data_name='sighan2005' \
--log_dir="./logs/" \
--output_dir="./checkpoints/" \
--num_tags=5 \
--seed=123 \
--gpu_ids="0" \
--max_seq_len=512 \
--lr=3e-5 \
--crf_lr=3e-2 \
--other_lr=3e-4 \
--train_batch_size=16 \
--train_epochs=3 \
--eval_batch_size=16 \
--lstm_hidden=128 \
--num_layers=1 \
--use_lstm='False' \
--use_crf='True' \
--dropout_prob=0.3 \
--dropout=0.3

precision:0.9667 recall:0.9549 micro_f1:0.9608
precision recall f1-score support

word 0.97 0.95 0.96 104371

micro-f1 0.97 0.95 0.96 104371

在1998年来临之际,我十分高兴地通过中央人民广播电台、中国国际广播电台和中央电视台,向全国各族人民,向香港特别行政区同胞、澳门和台湾同胞、海外侨胞,向世界各国的朋友们,致以诚挚的问候和良好的祝愿!
Load ckpt from ./checkpoints/bert_crf_sighan2005/model.pt
Use single gpu in: ['0']
{'word': [('', 0), ('1998年', 1), ('来临', 6), ('之际', 8), ('', 10), ('', 11), ('十分', 12), ('高兴', 14), ('', 16), ('通过', 17), ('中央', 19), ('人民', 21), ('广播', 23), ('电台', 25), ('', 27), ('中国', 28), ('国际', 30), ('广播', 32), ('电台', 34), ('', 36), ('中央', 37), ('电视台', 39), ('', 42), ('', 43), ('全国', 44), ('各族', 46), ('人民', 48), ('', 50), ('', 51), ('香港', 52), ('特别', 54), ('行政区', 56), ('同胞', 59), ('', 61), ('澳门', 62), ('', 64), ('台湾', 65), ('同胞', 67), ('', 69), ('海外', 70), ('侨胞', 72), ('', 74), ('', 75), ('世界各国', 76), ('', 80), ('朋友', 81), ('', 83), ('', 84), ('致以', 85), ('诚挚', 87), ('', 89), ('问候', 90), ('', 92), ('良好', 93), ('', 95), ('祝愿', 96), ('', 98)]}
```

# 补充商品标题要素抽取实例

数据来源:[商品标题](https://www.heywhale.com/mw/dataset/6241349d93e61600170895e5/file),就一个train.txt,初始格式为BIO。具体实验过程参考其它数据集说明。这里并没有运行完3个epoch,在720步手动终止了。类别数据进行了脱敏,要知道每类是什么意思,只有自己根据数据自己总结了=,=。
Expand Down
108 changes: 108 additions & 0 deletions data/sighan2005/raw_data/process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import json
from pprint import pprint  # kept from original file (unused here)


def build_records(lines, dtype="word", max_len=510):
    """Split whitespace-tokenized lines into labelled records of at most max_len chars.

    Each element of *lines* is one sentence whose words are separated by single
    spaces (SIGHAN 2005 segmentation format). Consecutive words are concatenated
    into a record until adding one word would push the character length past
    *max_len*; the accumulated prefix is then flushed as its own record and the
    current word starts a new one.

    Args:
        lines: iterable of raw text lines (trailing newlines are stripped).
        dtype: entity type written into every label (default "word").
        max_len: maximum character length of one record's text.
            Default 510 = 512 - 1 (0-based index) - 2 (CLS/SEP) + 1 (exclusive end).

    Returns:
        List of dicts {"id": int, "text": str,
                       "labels": [[tag, dtype, start, end, word], ...]}
        where start/end are character offsets into "text" (end exclusive)
        and tags restart at "T0" for every record.
    """
    records = []
    rec_id = 0
    for line in lines:
        words = line.strip().split(" ")
        chunk = []    # words accumulated for the current record
        labels = []
        tag_idx = 0
        for word in words:
            start = len("".join(chunk))
            chunk.append(word)
            end = start + len(word)
            labels.append(["T{}".format(tag_idx), dtype, start, end, word])
            tag_idx += 1
            if end > max_len:
                # Flush everything accumulated before the current word.
                sub_chunk = chunk[:-1]
                if sub_chunk:  # guard: a single over-long word must not emit an empty record
                    records.append({
                        "id": rec_id,
                        "text": "".join(sub_chunk),
                        "labels": labels[:-1],
                    })
                    rec_id += 1
                # Restart the record with the current word; tag ids restart at T0.
                chunk = [word]
                labels = [["T0", dtype, 0, len(word), word]]
                tag_idx = 1  # BUGFIX: original kept the old counter, yielding T0, Tk, Tk+1...
        if chunk:  # flush whatever remains of this line
            records.append({
                "id": rec_id,
                "text": "".join(chunk),
                "labels": labels,
            })
            rec_id += 1
    return records


def _convert(src_path, dst_path, max_len):
    """Read *src_path*, build records, and dump them as JSON to *dst_path*."""
    with open(src_path, "r", encoding="utf-8") as fp:
        lines = fp.readlines()
    with open(dst_path, "w", encoding="utf-8") as fp:
        json.dump(build_records(lines, max_len=max_len), fp, ensure_ascii=False)


def main():
    """Convert SIGHAN 2005 train/test files to mid_data JSON plus label maps."""
    max_seq_len = 512
    # -1: indices run 0-511; -2: drop CLS and SEP; +1: word end offset is exclusive.
    max_len = max_seq_len - 1 - 2 + 1

    _convert("training.txt", "../mid_data/train.json", max_len)

    with open("../mid_data/labels.json", "w", encoding="utf-8") as fp:
        json.dump(["word"], fp, ensure_ascii=False)

    # BIOES tag-to-id map for the single "word" entity type.
    nor_ent2id = {"O": 0, "B-word": 1, "I-word": 2, "E-word": 3, "S-word": 4}
    with open("../mid_data/nor_ent2id.json", "w", encoding="utf-8") as fp:
        json.dump(nor_ent2id, fp, ensure_ascii=False)

    _convert("test.txt", "../mid_data/test.json", max_len)


if __name__ == "__main__":
    main()
Loading

0 comments on commit 76e0d6d

Please sign in to comment.