-
Notifications
You must be signed in to change notification settings - Fork 0
/
words.py
111 lines (91 loc) · 2.67 KB
/
words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
'''
An analysis of the dialogue in Hamlet
1- Count the frequency of each word
'''
# Count the frequency of each word
# counts = {}
# with open('data/dialogue.txt') as f:
# for line in f:
# for word in line.split():
# counts[word] = counts.get(word,0) + 1
# # .get good in avoiding if conditions / try-except
#
# # show the first 3 most common words in "Hamlet"
# # best approach: from collections import Counter
# print sorted(counts.items(), key=lambda p: p[1], reverse=True)[:3]
# redone with Counter
from collections import Counter
import random
# Task 1: tally how often each whitespace-separated token occurs in the
# dialogue file.
counts = Counter()
with open('data/dialogue.txt') as dialogue:
    for row in dialogue:
        # Counter.update adds one to the tally of every token it is given;
        # tokens not seen before start from zero.
        counts.update(row.split())
# easy way to get the most common #
# print counts.most_common(3)
# task 2
# Organize the unique words by first letter
# words = {} # first letter --> set of words
# with open('data/dialogue.txt') as f:
# for line in f:
# for word in line.split():
# initial = word[0]
# # we use the set-default
# words.setdefault(initial, set()).add(word)
# for your convenience
from collections import defaultdict
# Warning: indexing a defaultdict never raises KeyError; a missing key is silently created with the default value instead.
# Task 2: group the distinct tokens of the dialogue by their first character.
words = defaultdict(set)  # first character --> set of tokens starting with it
with open('data/dialogue.txt') as dialogue:
    # split() with no argument splits on any whitespace, newlines included,
    # so the whole file can be tokenised in one call.
    for token in dialogue.read().split():
        # The defaultdict creates the empty set the first time a character
        # is seen.
        words[token[0]].add(token)
'''
task3
word --> list of words that follow
'''
# chain = defaultdict(list)
# last = None
# # Train
# with open('data/dialogue.txt') as f:
# for line in f:
# for word in line.split():
# chain[last].append(word)
# last = word
# Walk
# word = random.choice(list(chain))
# print word
#
# while word[-1] not in '.?!':
# word = random.choice(chain[word])
# print word,
# Improved Train
from collections import defaultdict
import random
def train(filename, size=1):
    """Build a follower table from the whitespace-separated words of a file.

    Args:
        filename: path of the text file to read.
        size: number of preceding words that make up one state (key).

    Returns:
        A ``defaultdict(list)`` mapping each state -- a tuple of ``size``
        consecutive words -- to the list of words observed directly after
        it, in file order and with repeats kept. The states at the very
        start of the text are padded on the left with ``None``.
    """
    followers = defaultdict(list)
    window = (None,) * size
    with open(filename) as handle:
        # Flatten the file into a single stream of tokens; line breaks carry
        # no meaning for the table.
        tokens = (token for row in handle for token in row.split())
        for token in tokens:
            followers[window].append(token)
            # Slide the window: drop the oldest word, append the newest.
            window = window[1:] + (token,)
    return followers
# Walk
def walk(chain):
    """Print a random walk through ``chain`` as one space-separated line.

    A starting state is picked at random from the keys of ``chain``. Its
    words are printed, then a random follower of the current state is
    printed repeatedly until a word ending in '.', '?' or '!' has been
    produced, or until the current state has no recorded followers.

    Args:
        chain: mapping from a state (tuple of words) to the list of words
            that may follow it, as returned by ``train``.

    Returns:
        None. The words are written to ``sys.stdout`` followed by a
        newline; nothing is written when ``chain`` is empty.
    """
    # Local import: the module does not import sys at the top, and writing
    # to sys.stdout works unchanged on both Python 2 and Python 3.
    import sys

    if not chain:
        return

    out = sys.stdout
    sep = ''  # becomes ' ' once the first word has been written

    last = random.choice(list(chain))
    word = None
    for word in last:
        # The states at the start of the text are padded with None; those
        # placeholders are not words and must not be printed.
        if word is not None:
            out.write(sep + word)
            sep = ' '

    # `word is None` covers a seed state that ends in padding (or is empty):
    # no real word exists yet, so keep drawing.
    while word is None or not word.endswith(('.', '?', '!')):
        # .get() avoids inserting an empty entry into a defaultdict when the
        # walk reaches a state that was never followed by anything.
        options = chain.get(last)
        if not options:
            break
        word = random.choice(options)
        out.write(sep + word)
        sep = ' '
        # Slide the window with tuple concatenation so the new state has the
        # same shape as the keys built during training.
        last = last[1:] + (word,)

    if sep:
        out.write('\n')
def randomness(chain):
    """Return the fraction of states that have a real choice of follower.

    A state counts as having a choice when its follower list contains more
    than one distinct word.

    Args:
        chain: mapping from a state to the list of words that follow it.

    Returns:
        A float between 0.0 and 1.0; 0.0 for an empty chain.
    """
    if not chain:
        # No states at all: avoid dividing by zero.
        return 0.0
    branching = sum(1 for options in chain.values()
                    if len(set(options)) > 1)
    return float(branching) / len(chain)
if __name__ == '__main__':
    # Script entry point: build the follower table from the dialogue file
    # using four-word states, then print one random walk through it.
    chain = train('data/dialogue.txt', size=4)
    walk(chain)