Skip to content

Commit

Permalink
update
Browse files: browse the repository at this point in the history.
  • Loading branch information
ZTurboX committed May 5, 2019
0 parents commit 155fab1
Show file tree
Hide file tree
Showing 18 changed files with 326,509 additions and 0 deletions.
12 changes: 12 additions & 0 deletions .idea/NER.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions .idea/encodings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

484 changes: 484 additions & 0 deletions .idea/workspace.xml

Large diffs are not rendered by default.

Empty file added const.py
Empty file.
83 changes: 83 additions & 0 deletions data_process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import json

def split_char(raw_data_file, split_char_file):
    """Write every input line as a line of space-separated characters.

    ``raw_data_file`` is read as GBK text. Each line is split on
    whitespace into tokens; for a token containing ``/`` only the text
    before the first ``/`` is kept, otherwise the whole token is kept.
    The kept text of all tokens on a line is broken into single
    characters, which are joined with one space.

    The result is *appended* to ``split_char_file`` (UTF-8), one output
    line per input line. A blank input line produces an empty output
    line.

    Args:
        raw_data_file: Path of the GBK-encoded annotated text to read.
        split_char_file: Path of the UTF-8 file to append to.
    """
    output_lines = []
    with open(raw_data_file, 'r', encoding='gbk') as f:
        for line in f:
            chars = []
            # split() with no argument already ignores leading/trailing
            # whitespace, so no separate strip() is needed.
            for item in line.split():
                # partition() yields the text before the first '/', or
                # the whole token when it has no '/'.
                word, _, _ = item.partition('/')
                chars.extend(word)
            output_lines.append(' '.join(chars) + '\n')

    # Input is fully read before the output is opened, so a decoding
    # error in the input never leaves a partially written output file.
    with open(split_char_file, 'a', encoding='utf-8') as fs:
        fs.writelines(output_lines)


def convert_data(raw_data_file, data_file):
    """Convert annotated text into per-character BIO-tagged JSON lines.

    ``raw_data_file`` is read as GBK text. Each line is split on
    whitespace into tokens of the form ``word/annotation`` (a token
    without ``/`` is treated as a bare word). Every character of the
    word becomes one entry of ``sentence`` and receives exactly one
    entry in ``tags``:

    * annotation ``ns`` -> ``B-LOC`` followed by ``I-LOC``
    * annotation ``nr`` -> ``B-PER`` followed by ``I-PER``
    * annotation ``nt`` -> ``B-ORG`` followed by ``I-ORG``
    * any other annotation, or no annotation -> ``O``

    One JSON object ``{"sentence": [...], "tags": [...]}`` per input
    line is *appended* to ``data_file`` (UTF-8, non-ASCII characters
    written as-is).

    Args:
        raw_data_file: Path of the GBK-encoded annotated text to read.
        data_file: Path of the UTF-8 JSON-lines file to append to.
    """
    entity_labels = {'ns': 'LOC', 'nr': 'PER', 'nt': 'ORG'}

    records = []
    with open(raw_data_file, 'r', encoding='gbk') as f:
        for line in f:
            sentence = []
            tags = []
            for item in line.split():
                # Split on the first '/', matching the original
                # find('/') based slicing.
                word, _, annotation = item.partition('/')
                if not word:
                    # A token such as '/ns' has no characters; emitting
                    # a B- tag for it would misalign tags and sentence.
                    continue
                sentence.extend(word)
                label = entity_labels.get(annotation)
                if label is None:
                    # Previously tokens with an unrecognised annotation
                    # (e.g. 'word/o') contributed characters but no
                    # tags, leaving the two lists with different
                    # lengths. Every such character is now tagged 'O'.
                    tags.extend(['O'] * len(word))
                else:
                    tags.append('B-' + label)
                    tags.extend(['I-' + label] * (len(word) - 1))
            records.append({"sentence": sentence, "tags": tags})

    # Input is fully read before the output is opened, so a decoding
    # error in the input never leaves a partially written output file.
    with open(data_file, 'a', encoding='utf-8') as fs:
        for record in records:
            json.dump(record, fs, ensure_ascii=False)
            fs.write('\n')


if __name__ == '__main__':
    # Paths of the raw annotated corpus and of the two derived files.
    raw_data_file = './raw_data/train.txt'
    split_train_char_file = './raw_data/split_train_char.txt'
    train_data_file = './raw_data/train.json'
    # Both conversion steps are left disabled, as in the original script.
    # split_char(raw_data_file, split_train_char_file)
    # convert_data(raw_data_file, train_data_file)







Empty file added main.py
Empty file.
Empty file added model.py
Empty file.
4,365 changes: 4,365 additions & 0 deletions raw_data/MSRA_Test_GB.flat

Large diffs are not rendered by default.

Loading

0 comments on commit 155fab1

Please sign in to comment.