-
Notifications
You must be signed in to change notification settings - Fork 1
/
lid_game.py
115 lines (90 loc) · 3.1 KB
/
lid_game.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# -*- coding: utf-8 -*-
import os
from os.path import join as pjoin
import json
import random
from collections import namedtuple
from pyquery import PyQuery
import wapiti
from clastic import Application, json_response
from clastic.static import StaticApplication # mostly for dev
# One record per Wikipedia language edition; fields are filled positionally
# from each entry of wikis.json (see load_langs).
WikiLangInfo = namedtuple(
    'WikiLangInfo',
    ['name',
     'en_article_name',
     'shortcode',
     'article_count',
     'active_user_count',
     'depth',
     'description'])

# Directory containing this module; data and static assets live beside it.
_CURDIR = os.path.abspath(os.path.dirname(__file__))
# Directory handed to the static file application in create_game().
_STATIC_PATH = os.path.join(_CURDIR, 'static')
def load_langs():
    """
    Read ``wikis.json`` (located next to this module) and index it by
    language name.

    The file holds a JSON list of lists, one inner list per language.
    Each inner list is unpacked positionally into a WikiLangInfo, so it
    must supply exactly one value per WikiLangInfo field.
    NOTE(review): earlier documentation showed six values per entry
    (name, English Wikipedia article, shortcode, article count, active
    user count, depth) while WikiLangInfo also has a trailing
    ``description`` field -- confirm against the actual data file.

    Returns a dict mapping language name -> WikiLangInfo. If two entries
    share a name, the later one wins.
    """
    with open(pjoin(_CURDIR, 'wikis.json')) as wiki_file:
        entries = json.load(wiki_file)
    records = (WikiLangInfo(*entry) for entry in entries)
    return {record.name: record for record in records}
# Language table, loaded once at import time: language name -> WikiLangInfo.
_LANG_DICT = load_langs()
def get_text(element):
    """
    Return the text held by ``element`` and its descendants as one string.

    Uses the element's ``text_content()`` method when it has one
    (presumably lxml elements); otherwise concatenates the pieces yielded
    by ``itertext()``.
    """
    if not hasattr(element, 'text_content'):
        # Fallback for element types without the text_content() helper.
        return u''.join(element.itertext())
    return element.text_content()
def get_sample(page_text, max_len=500):
    """
    Return a short excerpt of ``page_text`` for display.

    Text of at most ``max_len`` characters is returned unchanged. Longer
    text is cut to ``max_len`` characters, the last whitespace-separated
    word of the cut (which may have been chopped in half) is dropped, and
    ``' ...'`` is appended to signal the truncation.

    page_text -- the full text to excerpt
    max_len   -- maximum number of characters taken from ``page_text``
                 before the ellipsis is added (default 500)
    """
    # Only text that is actually longer than the limit counts as truncated;
    # text of exactly max_len characters is complete and needs no ellipsis.
    if len(page_text) <= max_len:
        return page_text
    words = page_text[:max_len].rsplit(None, 1)
    # rsplit() yields an empty list for a whitespace-only prefix; guard
    # against that instead of raising IndexError.
    sample = words[0] if words else ''
    return sample + ' ...'
def get_random_page():
    """
    Fetch a random article from a randomly chosen Wikipedia language
    edition.

    A language is picked uniformly from ``_LANG_DICT``; the wapiti client
    for that edition is asked for one random article, and the article's
    page is then requested.

    Returns a 3-tuple ``(lang, contents, page_title)`` where ``lang`` is
    the key into ``_LANG_DICT``, ``contents`` is whatever
    ``web_request_operation`` returned for the article URL, and
    ``page_title`` is the title of the random article.
    """
    # list() gives random.choice an indexable sequence on Python 3 too,
    # where dict.keys() is a view rather than a list.
    lang = random.choice(list(_LANG_DICT))
    lang_info = _LANG_DICT[lang]
    # Debugging tip: temporarily hard-code a language (e.g. 'Spanish') above.

    # URLs are assembled by plain concatenation, not os.path.join: the
    # latter discards the base when a title starts with '/' (such titles
    # exist, e.g. "/dev/null") and uses backslashes on Windows.
    lang_url = 'http://%s.wikipedia.org/' % (lang_info.shortcode,)
    lang_api_url = lang_url + 'w/api.php'
    wc = wapiti.WapitiClient('[email protected]',
                             api_url=lang_api_url)
    pages = wc.get_random_articles(limit=1)
    page_title = pages[0].title
    page_url = lang_url + 'wiki/' + page_title
    contents = wc.web_request_operation(page_url)
    return lang, contents, page_title
def language_game(attempt=0):
    """
    Build one round of the language-identification game.

    Fetches a random Wikipedia page, takes a sample from the first
    paragraph of its main content div, and pairs it with the correct
    language plus up to four other languages in random order.

    attempt -- number of retries already performed; used internally when
               a fetched page has no paragraph to sample from.

    Returns a dict with keys ``correct`` (language name),
    ``correct_info`` (the WikiLangInfo of that language as a dict),
    ``choices`` (shuffled list of language names containing ``correct``
    exactly once), ``sample`` (text excerpt) and ``title`` (page title).

    Raises RuntimeError if no page with a paragraph could be found within
    the retry limit.
    """
    max_attempts = 10  # upper bound on page fetches per round

    correct, contents, title = get_random_page()

    decoded_contents = contents[0].decode('utf-8')
    pq = PyQuery(decoded_contents)
    paragraphs = pq('div#mw-content-text').find('p')
    try:
        sample_p = paragraphs[0]
    except IndexError:
        # Page has no paragraph in its content div; try another page, but
        # do not recurse without bound.
        if attempt + 1 >= max_attempts:
            raise RuntimeError('no page with sample text found after '
                               '%d attempts' % max_attempts)
        return language_game(attempt=attempt + 1)

    # Draw the wrong answers only from the *other* languages so the
    # correct one cannot show up twice among the choices. Building a list
    # also keeps random.sample working on Python 3.
    distractors = [name for name in _LANG_DICT if name != correct]
    choices = random.sample(distractors, min(4, len(distractors)))
    choices.append(correct)
    random.shuffle(choices)

    page_text = get_text(sample_p)
    sample = get_sample(page_text)
    correct_info = _LANG_DICT[correct]._asdict()
    return {
        'correct': correct,
        'correct_info': correct_info,
        'choices': choices,
        'sample': sample,
        'title': title}
def create_game():
    """
    Assemble and return the clastic Application for the game.

    Both '/api/' and any path below it are routed to language_game and
    rendered with json_response; '/' is handled by a StaticApplication
    rooted at _STATIC_PATH.
    """
    static_app = StaticApplication(_STATIC_PATH)
    api_paths = ('/api/', '/api/<path:_ignored>')
    routes = [(path, language_game, json_response) for path in api_paths]
    routes.append(('/', static_app))
    return Application(routes)
# Module-level application object, built once at import time.
game_app = create_game()

if __name__ == '__main__':
    # Executed as a script: start serving the application.
    game_app.serve()