forked from barrucadu/markov
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenise.py
72 lines (58 loc) · 2.11 KB
/
tokenise.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import sys
class Tokeniser:
    """Flexible tokeniser for the Markov chain.

    Iterating over a Tokeniser consumes ``stream`` one item at a time and
    yields tokens:

    * a *word*: a maximal run of non-whitespace characters;
    * a *paragraph break*: the string ``'\\n\\n'``, produced when a newline
      directly follows a buffered lone newline (suppressed when
      ``noparagraphs`` is true, in which case it is plain whitespace).

    All other whitespace separates tokens and is discarded.

    Attributes:
        stream: Iterator read with ``next()``; each item is treated as a
            single character. Defaults to ``sys.stdin``.
            NOTE(review): iterating ``sys.stdin`` yields whole lines rather
            than characters, so callers presumably pass a per-character
            iterator -- confirm against the call sites.
        noparagraphs: When true, paragraph-break tokens are never produced.
        buffer: Characters accumulated since the last token.
        tok: A pending token handed out before more input is read. Nothing
            in this class sets it; it is kept as an extension point.
        halt: Set once the stream is exhausted.
    """

    def __init__(self, stream=None, noparagraphs=False):
        """Create a tokeniser.

        Args:
            stream: Iterator of characters; ``sys.stdin`` when ``None``.
            noparagraphs: If true, do not emit ``'\\n\\n'`` tokens.
        """
        self.stream = sys.stdin if stream is None else stream
        self.noparagraphs = noparagraphs

        # Initialise the iteration state here as well as in __iter__ so that
        # calling next() on a fresh instance works instead of raising
        # AttributeError.
        self.buffer = ''
        self.tok = ''
        self.halt = False

    def __iter__(self):
        """Reset the tokenising state and return ``self`` as the iterator."""
        self.buffer = ''
        self.tok = ''
        self.halt = False
        return self

    def __next__(self):
        """Return the next token.

        Returns:
            The next word, or ``'\\n\\n'`` for a paragraph break.

        Raises:
            StopIteration: When the stream is exhausted and no token is left.
            Exception: Any other error raised by the stream is propagated
                rather than being mistaken for end of input.
        """
        while not self.halt:
            # Return a pending token, if we have one
            if self.tok:
                out = self.tok
                self.tok = ''
                return out

            # Read the next character. Only StopIteration means EOF; a bare
            # except here would also swallow KeyboardInterrupt and real
            # stream errors.
            try:
                next_char = next(self.stream)
            except StopIteration:
                next_char = ''
                self.halt = True

            # Determine if we have a new token
            out = None
            if self.halt:
                # EOF: flush a buffered word as the final token. The word
                # test below cannot do this because ''.isspace() is False.
                if self.buffer and not self.buffer.isspace():
                    out = self.buffer
            elif self.buffer == '\n' and next_char == '\n':
                # Paragraph break
                if not self.noparagraphs:
                    out = self.buffer + next_char
                    next_char = ''
            elif (self.buffer and not self.buffer.isspace()
                  and next_char.isspace()):
                # A word, terminated by the whitespace just read
                out = self.buffer

            # If a token has been found, reset the buffer
            if out:
                self.buffer = ''

            # If the buffer is only spaces, clear it when a word is added
            if self.buffer.isspace() and not next_char.isspace():
                self.buffer = next_char
            else:
                self.buffer += next_char

            # Return the found token
            if out:
                return out

        # If we're here, we got nothing but EOF.
        raise StopIteration