-
Notifications
You must be signed in to change notification settings - Fork 0
/
statics.py
47 lines (31 loc) · 1.01 KB
/
statics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import re
import numpy as np
import collections
def count_corpus(tokens):
    """Tally how often each token occurs.

    `tokens` may be a flat list of tokens or a list of token lists; the
    nested form is flattened before counting. Only the first element is
    inspected to decide which form was passed.

    Defined in :numref:`sec_text_preprocessing`"""
    nested = len(tokens) == 0 or isinstance(tokens[0], list)
    if not nested:
        # Already a flat sequence of tokens: count it directly.
        return collections.Counter(tokens)
    # Concatenate every line's tokens into one flat list, then count.
    flattened = []
    for line in tokens:
        flattened.extend(line)
    return collections.Counter(flattened)
def load_data_statis(X_train_data, is_train=False):
    """Return the relative frequency of 20 fixed token ids in the input.

    The input is counted with ``count_corpus`` and each tracked id's count
    is divided by ``len(X_train_data)``.

    Args:
        X_train_data: Sequence of token ids. A list of token lists is also
            accepted (``count_corpus`` flattens it), but the divisor is
            still ``len(X_train_data)``, i.e. the number of top-level items.
        is_train: Currently unused; accepted only so existing callers that
            pass it keep working.

    Returns:
        A list of 20 floats, one per id in ``token_order`` (same order).
        Ids that never occur contribute ``0.0``. An empty input yields all
        zeros instead of raising ``ZeroDivisionError``.
    """
    # Fixed output layout: position i of the result holds the frequency of
    # token id token_order[i].
    token_order = [4, 20, 8, 2, 13, 7, 18, 6, 3, 1,
                   17, 11, 15, 14, 12, 10, 9, 5, 19, 16]
    total = len(X_train_data)
    if total == 0:
        # Nothing to count; avoid dividing by zero below.
        return [0.0] * len(token_order)
    counts = count_corpus(X_train_data)
    # Counter returns 0 for ids that are absent, so no membership test needed.
    return [counts[token_id] / total for token_id in token_order]