-
Notifications
You must be signed in to change notification settings - Fork 0
/
remove_dupes.py
executable file
·138 lines (128 loc) · 4.81 KB
/
remove_dupes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/python3
import os
import pyselect
import sys
import difflib
import datetime
import anime
from sqlalchemy import func
import argparse
import configparser
import sys
import re
def get_config(config_file: str = "weltschmerz.cfg") -> argparse.Namespace:
    """Build the runtime configuration from a config file and the command line.

    The ``[client]`` section of *config_file* supplies the defaults for
    ``--database`` (option ``database``) and ``--log-file`` (option ``log``).
    Values passed on the command line override those defaults.

    Args:
        config_file: Path of the INI-style configuration file to read.

    Returns:
        The parsed ``argparse.Namespace`` carrying the attributes
        ``database``, ``log_file``, ``preferred_directory_pattern`` and
        ``dry_run``.

    Raises:
        OSError: If *config_file* cannot be opened.
        configparser.Error: If the file cannot be parsed, or the ``[client]``
            section or one of its ``database`` / ``log`` options is missing.
    """
    config = configparser.ConfigParser()
    # Open through a context manager so the handle is closed as soon as the
    # file has been parsed instead of being left to the garbage collector.
    with open(config_file) as config_handle:
        config.read_file(config_handle)

    parser = argparse.ArgumentParser(description="Remove dupes from db and disk.")
    parser.add_argument(
        "--database", help="database to use", default=config.get("client", "database")
    )
    parser.add_argument(
        "--log-file",
        dest="log_file",
        help="logfile to use",
        default=config.get("client", "log"),
    )
    parser.add_argument(
        "--preferred-directory-pattern",
        dest="preferred_directory_pattern",
        help="on duplicates with only one filename, prefer to keep files in directory matching the pattern",
    )
    parser.add_argument(
        "--dry-run",
        "-n",
        help="dry-run, no removing deleted files",
        default=False,
        dest="dry_run",
        action="store_true",
    )
    # Reads sys.argv[1:]; argparse exits the process on unknown arguments.
    return parser.parse_args()
def update_mtimes(files):
    """Align the timestamps of all existing files to the oldest mtime among them.

    Entries whose ``full_path`` is not an existing regular file are ignored.
    Every remaining file whose modification time differs from the oldest one
    gets both its access and modification time set to that oldest value, and
    a message is printed for each file that is changed.

    Args:
        files: Iterable of objects exposing a ``full_path`` attribute.

    Returns:
        A list with the entries of *files* that exist on disk, in their
        original order (empty if none exist).
    """
    available_files = list(filter(lambda file: os.path.isfile(file.full_path), files))
    if not available_files:
        # Nothing on disk, so there is no timestamp to propagate.
        return available_files

    oldest_mtime = min(os.path.getmtime(file.full_path) for file in available_files)
    for file in available_files:
        if os.path.getmtime(file.full_path) == oldest_mtime:
            continue
        print(
            f"setting atime/mtime to {datetime.datetime.fromtimestamp(oldest_mtime).isoformat()} for {file.full_path}"
        )
        os.utime(file.full_path, (oldest_mtime, oldest_mtime))
    return available_files
if __name__ == "__main__":
    # Script entry point: find groups of database records that share the same
    # ed2k hash and file size, decide which copy to keep (automatically when
    # all copies have the same file name, interactively otherwise), delete the
    # other copies from disk and drop their records from the database.
    config = get_config()
    dbs = anime.DatabaseSession(config.database, False)

    # Compile the optional directory preference once, up front: it is reused
    # for every candidate, and an invalid pattern fails before any file has
    # been removed.
    preferred_directory = (
        re.compile(config.preferred_directory_pattern)
        if config.preferred_directory_pattern
        else None
    )

    # One (hash, size) row per group that has at least two records.
    known_dupes = (
        dbs.session.query(anime.LocalFile.hash_ed2k, anime.LocalFile.filesize)
        .group_by(anime.LocalFile.hash_ed2k, anime.LocalFile.filesize)
        .having(func.count() >= 2)
        .all()
    )
    total = len(known_dupes)

    for i, (hash_ed2k, filesize) in enumerate(known_dupes, start=1):
        known_files = (
            dbs.session.query(anime.LocalFile)
            .filter(
                anime.LocalFile.hash_ed2k == hash_ed2k,
                anime.LocalFile.filesize == filesize,
            )
            .order_by(anime.LocalFile.filename, anime.LocalFile.directory)
            .all()
        )
        print("\n" * 8 + "#" * 128 + "\n")

        # Also syncs the timestamps of the copies that still exist on disk.
        # NOTE(review): this touches file timestamps even with --dry-run;
        # confirm whether that is intended.
        available_files = update_mtimes(known_files)
        if len(available_files) <= 1:
            # Fewer than two copies on disk: nothing to deduplicate.
            continue
        print("[ {current:05} / {total:05} ]".format(current=i, total=total))

        # Distinct file names, sorted so the output (and the pair that gets
        # diffed below) is deterministic.
        file_names = sorted(
            {available_file.filename for available_file in available_files}
        )
        print(file_names)

        if len(file_names) == 1:
            # All copies share one name: keep the first one unless a copy
            # lives in a directory matching the preferred pattern.
            selection = available_files[0]
            if preferred_directory is not None:
                print("preferred pattern found")
                for available_file in available_files:
                    if preferred_directory.match(available_file.directory):
                        print(
                            f'selecting file from preferred location: "{available_file.directory}" - "{available_file.filename}"'
                        )
                        selection = available_file
                        break
            print("File to keep:")
            print(f"{selection.filename} ({selection.directory})")
        else:
            # Names differ: show how the first two differ and let the user
            # choose. context_diff() expects sequences of lines, so each name
            # is wrapped as a one-line list; passing the bare strings would
            # diff them character by character and print the result without
            # line breaks.
            sys.stdout.writelines(
                difflib.context_diff(
                    [file_names[0] + "\n"],
                    [file_names[1] + "\n"],
                    fromfile="a",
                    tofile="b",
                )
            )
            print()
            print("File to keep:")
            selection = pyselect.select(
                available_files, 'f"{option.filename} ({option.directory})"'
            )

        # From here on available_files holds only the copies to remove.
        available_files.remove(selection)
        print([file.full_path for file in available_files])
        for file in available_files:
            # Only delete from disk while the kept copy still exists and the
            # candidate is not the very same file reached through another
            # path (e.g. a symlink or a duplicate record).
            if os.path.isfile(selection.full_path) and os.path.realpath(
                selection.full_path
            ) != os.path.realpath(file.full_path):
                print(f"deleting file: {file.full_path}")
                if not config.dry_run:
                    os.remove(file.full_path)
            print(f"removing file from db: {file.full_path}")
            if not config.dry_run:
                dbs.session.delete(file)
        # Commit once per duplicate group so the database follows the disk.
        if not config.dry_run:
            dbs.session.commit()