forked from hyphanet/pyFreenet
-
Notifications
You must be signed in to change notification settings - Fork 4
/
copyweb
executable file
·86 lines (72 loc) · 3.57 KB
/
copyweb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env python3
# encoding: utf-8
"""Copy a website into Freenet -- either a single page or the full site.
A bridge between wget and pyFreenet."""
import argparse
import os
import sys
import subprocess
# Command-line interface: one or more page URLs plus optional download settings.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "pages",
    nargs="+",
    help="the URL to the website to copy",
)
parser.add_argument(
    "-d", "--target-directory",
    default=None,
    # Without -d the script asks for confirmation and then writes into the
    # current working directory; it never derives a directory from the URL,
    # so the help text must not promise a host-based default.
    help="target directory (default: the current directory, after confirmation)",
)
parser.add_argument(
    "--mirror",
    action="store_true",
    default=False,
    help="Mirror the whole site (default: only the pages explicitly given on the commandline)",
)
# Parse the command line once; the rest of the script reads its settings
# (pages, target_directory, mirror) from `args`.
args = parser.parse_args()
# Executable used as the first element of the download command.
wget_program = "wget"

# The command is run as an argument list without a shell, so the value must
# not carry shell-style quotes: wget would send them as part of the header.
_wget_user_agent = (
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.6) "
    "Gecko/20070802 SeaMonkey/1.1.4"
)

# Options shared by both download modes.
_wget_common_options = [
    "--exclude-domains", "127.0.0.1,localhost",
    "--no-check-certificate", "-e", "robots=off",
    "-U", _wget_user_agent,
]

# wget option lists, keyed by download mode.
wget_mode_option_lists = {
    # -nd ensures that the index.html is at the top-level. Better would be
    # to only create sibling directories, though.
    "mirror": [
        "-m", "-nc", "-k", "-p", "-np", "-nH", "-nd", "-E",
    ] + _wget_common_options,
    "single_page": [
        "-t", "2", "-np", "-N", "-k", "-p", "-nd", "-nH", "-H", "-E",
    ] + _wget_common_options,
}

# Templates for options that take a value supplied at runtime.
wget_options = {
    "target_directory": "--directory-prefix={}",
}
def _confirm(question):
    """Ask a yes/no question on stdin; an empty answer counts as yes."""
    return input(question).strip().lower() in ["", "y", "yes"]


# Assemble the wget invocation: program, mode-specific options, target directory.
command = [wget_program]
command.extend(
    wget_mode_option_lists["mirror" if args.mirror else "single_page"])

# Set once the user has explicitly agreed to write into the current
# directory, so the "directory exists" question is not asked a second time
# for the very same directory.
directory_confirmed = False
if args.target_directory:
    command.append(wget_options["target_directory"].format(
        args.target_directory))
else:
    # TODO: always require -d (avoids unnecessary interactivity here)
    if not _confirm(("You did not specify a target directory. "
                     "The site will be written in the current directory. "
                     "Are you in the directory in which the site should be written? "
                     "If not, please specify the target directory with -d <target directory>. "
                     "(current directory: {}) "
                     "(Yes/no) ").format(os.getcwd())):
        sys.exit(1)
    # No --directory-prefix is passed in this case; the path is recorded
    # for the steps that run after the download.
    args.target_directory = os.path.abspath(os.getcwd())
    directory_confirmed = True

# Guard against accidentally writing into an existing directory given via -d.
if not directory_confirmed and os.path.exists(args.target_directory):
    if not _confirm(("Target directory exists ({}). "
                     "Really write into it? "
                     "(Yes/no) ").format(args.target_directory)):
        sys.exit(1)

# Run wget as an argument list (no shell); its exit status is not inspected.
subprocess.call(command + args.pages)
# If the download did not produce an index.html, create one by copying the downloaded page.
def copy(source, target):
    """Copy the file at *source* to *target*, byte for byte.

    The content is copied as raw bytes: downloaded pages can be in any
    encoding (or not be text at all), so decoding and re-encoding them with
    the locale's default encoding could fail or alter the data.

    Args:
        source: path of the existing file to read.
        target: path of the file to create or overwrite.

    Raises:
        OSError: if *source* cannot be read, *target* cannot be written,
            or both paths refer to the same file.
    """
    import shutil  # local import: only this helper needs it

    shutil.copyfile(source, target)
# Some sites serve FILENAME.html rather than index.html: when no index.html
# was downloaded, derive one from the last path segment of the first URL.
index_path = os.path.join(args.target_directory, "index.html")
if not os.path.isfile(index_path):
    page_name = args.pages[0].split("/")[-1]
    # Try the ".html"-suffixed file first, then the bare name; use the
    # first one that exists.
    for candidate_name in (page_name + ".html", page_name):
        candidate_path = os.path.join(args.target_directory, candidate_name)
        if os.path.isfile(candidate_path):
            copy(candidate_path, index_path)
            break