From 1e5f43aa228ecd9df81f1105514cc1c464f86092 Mon Sep 17 00:00:00 2001
From: zhezhaoa <1152543959@qq.com>
Date: Thu, 17 May 2018 01:14:55 +0800
Subject: [PATCH] Add files via upload

---
Review notes (free-form area, not part of the change itself):

 * This patch was rebuilt from a copy whose whitespace had been collapsed.
   Indentation, blank context lines and the column alignment of the docopt
   option table are inferred from the hunk line counts, so verify them
   against the target files or apply with whitespace tolerance.
 * The docopt placeholders (<corpus> <vocab> <pairs>) and the keys in
   args['<pairs>'] / args['<vocab>'] were empty in that copy; the names used
   here are assumed from how the values are used. TODO confirm.
 * Changes relative to the submitted patch:
     - c2p raises ValueError for an unknown --feature instead of `break`,
       which silently left truncated pairs files.
     - The dynamic window is drawn from the module-level `random` (as
       word_word did before this patch). `rnd = Random(17)` is re-created on
       every call, so `rnd.randint(1, win)` returned the same size for every
       line. `import random` is therefore kept and the word_word hunk dropped.
     - word_character handles both Python 2 byte strings and Python 3 text,
       and its copy-pasted header comment is corrected.
 * The post-image blob ids on the `index` lines were not recomputed.

 ngram2vec/corpus2pairs.py  | 22 +++++++++++++++++-----
 ngram2vec/line2features.py | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/ngram2vec/corpus2pairs.py b/ngram2vec/corpus2pairs.py
index e303543..117d086 100644
--- a/ngram2vec/corpus2pairs.py
+++ b/ngram2vec/corpus2pairs.py
@@ -6,7 +6,7 @@ from representations.matrix_serializer import load_count_vocabulary
 
 import six
 import sys
-from line2features import ngram_ngram, word_word, word_text, word_wordLR, word_wordPos
+from line2features import ngram_ngram, word_word, word_character, word_text, word_wordLR, word_wordPos
 
 
 def main():
@@ -15,15 +15,19 @@ def main():
         corpus2pairs.py [options] <corpus> <vocab> <pairs>
 
     Options:
+        --feature STR           Co-occurrence types used for training [default: ngram-ngram]
         --win NUM               Window size [default: 2]
         --sub NUM               Subsampling threshold [default: 0]
-        --ngram_word NUM        (Center) word vocabulary includes grams of 1st to nth order [default: 1]
+        --ngram_word NUM        Word vocabulary includes grams of 1st to nth order [default: 1]
         --ngram_context NUM     Context vocabulary includes grams of 1st to nth order [default: 1]
+        --ngram_char_low NUM    The low bound of character ngram [default: 1]
+        --ngram_char_up NUM     The up bound of character ngram [default: 4]
         --threads_num NUM       The number of threads [default: 8]
         --overlap               Whether overlaping pairs are allowed or not
+        --dynamic_win           Whether dynamic window is allowed or not
     """)
 
-    print ("**********************")
+    print ("*********************************")
     print ("corpus2pairs")
     threads_num = int(args['--threads_num'])
     threads_list = []
@@ -38,9 +42,11 @@ def main():
 
 def c2p(args, tid):
     pairs_file = open(args['<pairs>']+"_"+str(tid), 'w')
+    feature = args['--feature'] #features, also known as co-occurrence types, are critical to the property of word representations. Supports ngram-ngram, word-word, word-character, and so on.
     threads_num = int(args['--threads_num'])
     subsample = float(args['--sub'])
     sub = subsample != 0
+
     vocab = load_count_vocabulary(args['<vocab>']) #load vocabulary (generated in corpus2vocab stage)
     train_uni_num = 0 #number of (unigram) tokens in corpus
     for w, c in six.iteritems(vocab):
@@ -61,8 +67,14 @@ def c2p(args, tid):
                 sys.stdout.flush()
             if line_num % threads_num != tid:
                 continue
-            ngram_ngram(line, args, vocab, pairs_file, sub, subsampler)
-            # word_word(line, args, vocab, pairs_file, sub, subsampler)
+            if feature == 'ngram-ngram':
+                ngram_ngram(line, args, vocab, pairs_file, sub, subsampler)
+            elif feature == 'word-word': #identical to word2vec
+                word_word(line, args, vocab, pairs_file, sub, subsampler)
+            elif feature == 'word-character': #similar to fasttext
+                word_character(line, args, vocab, pairs_file, sub, subsampler)
+            else:
+                raise ValueError("unsupported --feature: " + feature) #fail loudly rather than leave a truncated pairs file
             # word_text(line, args, vocab, pairs_file, sub, subsampler, line_num)
             # word_wordPos(line, args, vocab, pairs_file, sub, subsampler)
 
diff --git a/ngram2vec/line2features.py b/ngram2vec/line2features.py
index e126b15..c244271 100644
--- a/ngram2vec/line2features.py
+++ b/ngram2vec/line2features.py
@@ -9,6 +9,9 @@ def ngram_ngram(line, args, vocab, pairs_file, sub, subsampler):
     ngram_context = int(args['--ngram_context'])
     overlap = args['--overlap']
     rnd = Random(17)
+    dynamic = args['--dynamic_win']
+    if dynamic:
+        win = random.randint(1, win) #dynamic window; not drawn from rnd, which is re-seeded on every call and would give a constant size
     tokens = line.strip().split()
     for i in range(len(tokens)): #loop for each position in a line
         for gram_word in range(1, ngram_word+1): #loop for grams of different orders in (center) word
@@ -56,6 +59,39 @@ def word_word(line, args, vocab, pairs_file, sub, subsampler): #identical to the
             pairs_file.write(word + ' ' + context + "\n")
 
 
+def word_character(line, args, vocab, pairs_file, sub, subsampler): #similar to fasttext: each word is paired with its context words and with its own character ngrams
+    rnd = Random(17)
+    char_range = (int(args['--ngram_char_low']), int(args['--ngram_char_up'])) #character range
+    win = int(args['--win'])
+    dynamic = args['--dynamic_win']
+    if dynamic:
+        win = random.randint(1, win) #dynamic window; not drawn from rnd, which is re-seeded on every call and would give a constant size
+    tokens = [t if t in vocab else None for t in line.strip().split()]
+    if sub:
+        tokens = [t if t not in subsampler or rnd.random() > subsampler[t] else None for t in tokens]
+    for i in range(len(tokens)): #loop for each position in a line
+        word = getNgram(tokens, i, 1)
+        if word is None:
+            continue
+        start = i - win
+        end = i + win
+        for j in range(start, end + 1):
+            context = getNgram(tokens, j, 1)
+            if context is None:
+                continue
+            if i == j: #at the center position, emit the word's character ngrams instead of the word itself
+                is_bytes = isinstance(context, bytes) #true for Python 2 str, false for Python 3 str
+                characters = context.decode('utf-8') if is_bytes else context #slice by character, not by byte
+                for char_ngram in range(char_range[0], char_range[1] + 1):
+                    for char_start in range(len(characters) - char_ngram + 1): #empty range when the ngram is longer than the word
+                        gram = characters[char_start: char_start + char_ngram]
+                        if is_bytes:
+                            gram = gram.encode('utf-8') #back to bytes so it can be concatenated with word
+                        pairs_file.write(word + ' ' + gram + "\n")
+                continue
+            pairs_file.write(word + ' ' + context + "\n")
+
+
 def word_wordLR(line, args, vocab, pairs_file, sub, subsampler):
     win = int(args['--win'])
     rnd = Random(17)