test_dump.py
import argparse
import logging
import os
import pathlib
import shutil
import subprocess
import sys
import time

import imagehash
import yaml
from PIL import Image

from cyoa_archives.grist.api import GristAPIWrapper

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)
# Parse args
parser = argparse.ArgumentParser(
    description="Download images for CYOA records and compute perceptual hashes."
)
parser.add_argument("-c", "--config_file", help="Configuration file to use")
parser.add_argument("-d", "--database_folder", help="Folder to use as database")
parser.add_argument("-t", "--temporary_folder", help="Folder to use to temporarily keep files")
args = parser.parse_args()
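# Example invocation (paths are illustrative):
#   python test_dump.py -c config.yaml -d ./cyoa_db -t ./tmp_downloads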
if args.config_file:
    filepath = pathlib.Path(args.config_file)
    try:
        with open(filepath) as f:
            config = yaml.safe_load(f)
    except OSError:
        print(f"Could not read file: {filepath}")
        sys.exit(1)
else:
    # Without a config there are no Grist credentials, so bail out early
    parser.error("a configuration file is required (-c/--config_file)")
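# The YAML config is expected to carry a top-level `grist` section, presumably
# holding the credentials consumed by GristAPIWrapper.from_config below.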
# If the database folder does not exist, create it
dbdir = pathlib.Path(args.database_folder)
tempdir = pathlib.Path(args.temporary_folder)
if not dbdir.exists():
    logger.info(f'Making database folder at: {dbdir.resolve()}')
    os.makedirs(dbdir)
# Set up API
api = GristAPIWrapper.from_config(config.get('grist'))
grist_pd = api.fetch_table_pd('Records', col_names=[
    'id', 'cyoa_uuid', 'is_cyoa', 'static_url', 'interactive_url', 'broken_link', 'image_hashes', 'created_utc',
    'cyoa', 'link_flair_text', 'title'
])
cyoa_pd = grist_pd.loc[grist_pd['is_cyoa'].eq('Yes')].sort_values(by=['created_utc'], ascending=False)
logger.debug(len(cyoa_pd))
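# Walk every confirmed CYOA row (newest first): download its gallery, hash the
# images, and queue the results for a batched Grist update.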
result_list = []
count = 0
for index, row in cyoa_pd.iterrows():
    g_id = row['id']
    cyoa_uuid = row['cyoa_uuid']
    is_cyoa = row['is_cyoa']
    static_url = row['static_url']
    interactive_url = row['interactive_url']
    broken_link = row['broken_link']
    image_hashes = row['image_hashes']
    flair = row['link_flair_text']
    title = row['title']

    try:
        # Skip irrelevant rows (no static URL, already hashed, broken, or interactive)
        if not static_url or broken_link or image_hashes or interactive_url:
            continue

        # Skip posts flagged as interactive
        if flair and 'Interactive' in flair:
            continue

        # Empty the temporary directory
        if tempdir.exists():
            logger.info(f'Deleting directory: {tempdir.resolve()}')
            shutil.rmtree(tempdir.resolve())
        os.makedirs(tempdir)
        # Download using gallery-dl
        subprocess.run(
            ['gallery-dl', static_url, '-d', str(tempdir.resolve()), '--range', '1-100'],
            universal_newlines=True
        )
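        # gallery-dl's -d flag sets the base download directory and --range
        # limits the run to the first 100 files of the gallery.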
        image_paths = []
        for extension in ['*.png', '*.jpg', '*.jpeg', '*.webp']:
            for image_path in tempdir.rglob(extension):
                image_paths.append(image_path)
        logger.debug(image_paths)
        # Now run the hashing algorithm on every image in the temporary directory
        hash_list = []
        for i, image in enumerate(image_paths):
            # Hash the image (average hash is used because it is less tolerant)
            img = Image.open(image)
            image_hash = imagehash.average_hash(img)
            color_hash = imagehash.colorhash(img, binbits=3)
            image_hash_str = str(image_hash) + '_' + str(color_hash)
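            # The stored value is "<average_hash>_<colorhash>", pairing a
            # structural hash with a color hash so near-duplicates must match on both.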
            hash_list.append(image_hash_str)
            # If it's an imgur image, save a copy into the database folder
            if 'imgur.' in static_url:
                if cyoa_uuid:
                    cyoa_directory = dbdir / cyoa_uuid
                else:
                    cyoa_directory = dbdir / title.replace('/', '').strip()
                if not cyoa_directory.exists():
                    logger.info(f'Making database folder at: {cyoa_directory.resolve()}')
                    os.makedirs(cyoa_directory)
                image_path_copy = cyoa_directory / image.name
                shutil.copyfile(image, image_path_copy)
        # Detect no downloads
        is_broken_link = False
        if not image_paths:
            is_broken_link = True

        # Append results
        result_list.append({
            'id': g_id,
            'image_hashes': ', '.join(hash_list),
            'broken_link': is_broken_link
        })
    except Exception:
        logger.warning(f'Unable to hash image: {static_url}')

    time.sleep(3)
    count = count + 1
    if count % 25 == 0:
        # Flush queued results every 25 rows, then start a fresh batch
        api.update_records('Records', result_list, mock=False, prompt=False)
        result_list = []

# Update grist with any remaining results
api.update_records('Records', result_list, mock=False, prompt=False)