-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
83 lines (68 loc) · 2.85 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import os
import tempfile
from io import StringIO, RawIOBase
import bson as bson
import happybase
from flask import Flask, request
from warcio import WARCWriter, StatusAndHeaders
# HBase Thrift endpoint. Defaults target a local server; the HBASE_HOST and
# HBASE_PORT environment variables override them when present.
hbase_host = os.environ.get('HBASE_HOST', 'localhost')
hbase_port = int(os.environ.get('HBASE_PORT', 9090))
# Flask application object; the route below is registered on it and the
# __main__ guard at the bottom of the file runs it.
app = Flask(__name__)
class BytesIOWrapper(RawIOBase):
    """Present a text stream as a read-only raw binary stream.

    Text is pulled from the wrapped file in chunks of ``chunk_size``
    characters, encoded, and handed out through ``readinto`` so that the
    standard ``RawIOBase`` read methods work on top of it.

    Args:
        file: Object whose ``read(n)`` returns ``str`` (for example
            ``io.StringIO``).
        encoding: Codec used to turn each text chunk into bytes.
        errors: Error handler passed to ``str.encode``.
        chunk_size: Number of characters requested from ``file`` per refill.
            Must be at least 1, because an empty read is treated as end of
            stream.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """

    def __init__(self, file, encoding='utf-8', errors='strict', chunk_size=4096):
        super().__init__()
        if chunk_size < 1:
            raise ValueError('chunk_size must be a positive integer')
        self.file, self.encoding, self.errors = file, encoding, errors
        self.chunk_size = chunk_size
        # Encoded bytes already read from ``file`` but not yet handed out.
        self.buf = b''

    def readinto(self, buf):
        """Copy up to ``len(buf)`` pending bytes into ``buf``.

        Returns:
            The number of bytes written, or 0 once the wrapped file is
            exhausted.
        """
        if not self.buf:
            # Refill only when the previous chunk has been fully consumed.
            text = self.file.read(self.chunk_size)
            self.buf = text.encode(self.encoding, self.errors)
        if not self.buf:
            return 0
        length = min(len(buf), len(self.buf))
        buf[:length] = self.buf[:length]
        self.buf = self.buf[length:]
        return length

    def readable(self):
        """Report that the stream supports reading."""
        return True
def _write_warc_records(table, row_keys, stream):
    """Write one gzipped WARC record per requested row key to ``stream``.

    Rows without the ``cf1:IF`` column produce a ``metadata`` record whose
    payload is a fixed "not found" message. All other rows produce a
    ``response`` record built from the BSON document stored in ``cf1:IF``
    and the payload stored in ``cf1:plain-text``.
    """
    # Function-scope import so this block does not rely on a change to the
    # file's import section.
    from io import BytesIO

    writer = WARCWriter(stream, gzip=True)
    for uuid in row_keys:
        app.logger.info(uuid)
        row = table.row(uuid)
        if b'cf1:IF' not in row:
            record = writer.create_warc_record(
                uuid, 'metadata',
                payload=BytesIOWrapper(StringIO("Entry does not found.")))
        else:
            inter_format = bson.loads(row[b'cf1:IF'])
            rec_headers = inter_format['rec-headers']
            http_headers = StatusAndHeaders('200 OK', [], protocol='HTTP/1.0')
            record = writer.create_warc_record(
                inter_format['url'], 'response',
                # The cell value is already bytes; wrapping it in str() would
                # store the "b'...'" repr in the record instead of the content.
                payload=BytesIO(row[b'cf1:plain-text']),
                http_headers=http_headers,
                warc_headers_dict=rec_headers,
                warc_content_type=rec_headers['Content-Type'])
        writer.write_record(record)


@app.route('/', methods=['POST'])
def process_json():
    """Export the HBase rows named in the JSON request body as a WARC download.

    The request body is iterated and each item is used as a row key in the
    ``main`` table. The records are written to a temporary gzipped WARC file,
    which is streamed back as ``export.warc.gz`` and deleted once streaming
    ends.

    Returns:
        A streaming ``application/warc`` response, or the text
        ``'Content-Type not supported!'`` with status 415 when the request
        does not declare a JSON content type.
    """
    # is_json also accepts parameters such as "; charset=utf-8", which an
    # exact comparison against 'application/json' would reject.
    if not request.is_json:
        return 'Content-Type not supported!', 415

    row_keys = request.json
    fd, path = tempfile.mkstemp()
    try:
        # os.fdopen takes ownership of the descriptor returned by mkstemp, so
        # it is closed together with the file object instead of leaking.
        with os.fdopen(fd, 'wb') as f:
            connection = happybase.Connection(hbase_host, hbase_port)
            try:
                connection.open()
                _write_warc_records(connection.table('main'), row_keys, f)
            finally:
                connection.close()
    except Exception:
        # Nothing will stream the file after a failure, so remove it here.
        os.remove(path)
        raise

    def generate():
        # Stream the finished file and delete it once iteration ends.
        try:
            with open(path, 'rb') as fr:
                yield from fr
        finally:
            os.remove(path)

    r = app.response_class(generate(), mimetype='application/warc')
    r.headers.set('Content-Disposition', 'attachment', filename='export.warc.gz')
    return r
# Start Flask's built-in server only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    app.run()