-
Notifications
You must be signed in to change notification settings - Fork 0
/
random_text.py
167 lines (133 loc) · 5.02 KB
/
random_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
from collections import defaultdict
from random import random
def make_word(probs):
    """Make a random word based on the probability dictionary passed in.

    ``probs`` maps a context string to a dictionary of
    ``{next_character: cumulative_probability}``.  The contexts looked up are
    ``'^'`` (start of word), ``'^'`` plus the first letter, and afterwards
    the three-character window made of the last two characters of the
    previous context plus the newly drawn character.  Generation stops as
    soon as the end-of-word marker ``'$'`` is drawn; the marker itself is not
    part of the returned word.

    Raises ``KeyError`` if a needed context is missing from ``probs`` and
    ``ValueError`` if a context maps to an empty distribution.
    """
    def get_letter(random_float, cumulative_probs):
        """Return the first character whose cumulative value reaches
        ``random_float``.
        """
        ranked = sorted(
            (value, key) for (key, value) in cumulative_probs.items())
        if not ranked:
            raise ValueError('cannot pick a letter from an empty distribution')
        for value, key in ranked:
            if value >= random_float:
                return key
        # Floating point accumulation can leave the largest cumulative value
        # slightly below the random draw; the top-ranked letter is the only
        # sensible choice in that case.
        return ranked[-1][1]

    first = get_letter(random(), probs['^'])
    second = get_letter(random(), probs['^' + first])
    letters = [first, second]
    context = '^' + first + second
    while context[-1] != '$':
        drawn = get_letter(random(), probs[context])
        letters.append(drawn)
        # Slide the window: keep the last two characters, add the new one.
        context = context[-2:] + drawn
    # The final element is always the '$' marker, so drop it.
    return ''.join(letters[:-1])
def build_count_dict(max_lines=1000, path='shakespeare.txt', skip_lines=300):
    """Read a text file and count which character follows each context.

    The result maps ``context -> {next_character: count}`` where the contexts
    are ``'^'`` (start of word), ``'^'`` plus the first letter, and every
    three-character window of the lower-cased word padded as
    ``'^' + word + '$'``.

    ``max_lines`` is the number of lines processed after the header.
    ``path`` is the file to read and ``skip_lines`` the number of leading
    lines to ignore (the defaults skip the 300 lines of copyright info at
    the top of shakespeare.txt).  A file shorter than ``skip_lines`` simply
    yields an empty result.
    """
    from itertools import islice

    def update_count_dict(word, count_dict):
        """Record every transition found in one (non-empty) word."""
        word = word.lower()
        first_letter = word[0]
        count_dict['^'][first_letter] += 1
        if len(word) > 1:
            count_dict['^' + first_letter][word[1]] += 1
        # NOTE(review): for a one-letter word the loop below runs zero times,
        # so such words only contribute to the first-letter counts and never
        # record an end-of-word transition.
        padded = '^' + word + '$'
        for i in range(len(padded) - 3):
            count_dict[padded[i:i + 3]][padded[i + 3]] += 1

    def process_line(line, count_dict):
        """Count the words of one line.

        Processing of the line stops at the first word that equals its own
        upper-cased form (this includes the empty strings produced by
        consecutive spaces) or that contains a bracket or parenthesis; words
        before that point have already been counted.
        """
        line = line.strip()
        if not line:
            return
        for word in line.split(' '):
            if word.upper() == word:
                return
            if any(char in word for char in '()[]'):
                return
            update_count_dict(word, count_dict)

    count_dict = defaultdict(lambda: defaultdict(int))
    first_line = max(skip_lines, 0)
    last_line = first_line + max(max_lines, 0)
    with open(path, 'r') as text:
        for line in islice(text, first_line, last_line):
            process_line(line, count_dict)
    return count_dict
def build_probability_dict(count_dict):
    """Turn a dictionary that counts the frequency of letters into a
    dictionary of probabilities.

    ``count_dict`` maps ``context -> {character: count}``.  The returned
    dictionary has the same keys, with each count divided by the total of
    its context, so the values of every inner dictionary sum to 1 (up to
    floating point rounding).  The input is not modified.

    Raises ``ZeroDivisionError`` if a context has entries whose counts sum
    to zero.
    """
    def convert_counts_to_probabilities(counts):
        """Normalise one ``{character: count}`` mapping."""
        total = float(sum(counts.values()))
        return {
            letter: float(count) / total
            for letter, count in counts.items()
        }

    return {
        context: convert_counts_to_probabilities(counts)
        for context, counts in count_dict.items()
    }
def make_cumulative(probabilities):
    """Convert per-character probabilities to cumulative ones, in place.

    Takes a dictionary mapping
    {
        character(*): {
            character(**): probability of following the original character(*)
        }
    }
    and replaces every inner value with the running total of the values seen
    so far (in the inner dictionary's iteration order).  This makes it simple
    to select one character randomly: draw a number in [0, 1) and take the
    first character whose cumulative value reaches it.

    When the running total of a context ends within rounding distance of 1,
    the final value is set to exactly 1.0 so that a random draw can never
    exceed every cumulative value.  Returns ``None``.
    """
    for distribution in probabilities.values():
        running_total = 0.0
        last_letter = None
        # Iterate over a snapshot because the values are rewritten in place.
        for letter, probability in list(distribution.items()):
            running_total += probability
            distribution[letter] = running_total
            last_letter = letter
        if last_letter is not None and abs(running_total - 1.0) < 1e-9:
            distribution[last_letter] = 1.0
def prime_probability_dict(max_lines=100):
    """Build a ready-to-use probability dictionary for word generation.

    Counts letter transitions with ``build_count_dict`` (passing
    ``max_lines`` through), normalises them with ``build_probability_dict``
    and finally converts them to cumulative form with ``make_cumulative``.
    """
    probabilities = build_probability_dict(
        build_count_dict(max_lines=max_lines))
    # make_cumulative rewrites the dictionary in place.
    make_cumulative(probabilities)
    return probabilities
def make_paragraph(probs, num_words=200):
    """Generate a paragraph of ``num_words`` words based on the probability
    dictionary passed in.

    Words come from ``make_word(probs)`` and are joined with single spaces.
    The first character and the character two positions after every '.',
    '?' or '!' are upper-cased, a stand-alone or apostrophe-followed ``i``
    becomes ``I``, and the paragraph is made to end in '.', '?' or '!'.
    Returns an empty string when no words are requested.
    """
    def handle_periods(paragraph, i):
        """Capitalize the beginnings of sentences."""
        if paragraph[i] in '.?!' and i + 2 < len(paragraph):
            paragraph[i + 2] = paragraph[i + 2].upper()

    def handle_i(paragraph, i):
        """Upper-case an ``i`` that starts a word and is followed by a space,
        an apostrophe or the end of the paragraph.

        Index 0 needs no handling since the first character is upper-cased
        before the scan starts.
        """
        if i == 0 or paragraph[i] != 'i' or paragraph[i - 1] != ' ':
            return
        is_last = i + 1 == len(paragraph)
        if is_last or paragraph[i + 1] in ' \'':
            paragraph[i] = 'I'

    def handle_end_punctuation(paragraph):
        """Add a period to the end of the paragraph if appropriate."""
        letter = paragraph[-1]
        if letter in '!.?':
            return
        if letter in ',;':
            # Replace a dangling comma or semicolon rather than stacking
            # punctuation.
            paragraph[-1] = '.'
        else:
            paragraph.append('.')

    # Work on a list of characters so single positions can be rewritten.
    paragraph = list(' '.join(make_word(probs) for _ in range(num_words)))
    if not paragraph:
        return ''
    paragraph[0] = paragraph[0].upper()
    for i in range(len(paragraph)):
        handle_periods(paragraph, i)
        handle_i(paragraph, i)
    handle_end_punctuation(paragraph)
    return ''.join(paragraph)
if __name__ == '__main__':
    # Train on the first 10000 lines after the header, then print one
    # generated paragraph.  A single-argument print call behaves the same on
    # Python 2 and Python 3.
    probability_dict = prime_probability_dict(max_lines=10000)
    print(make_paragraph(probability_dict))