Merge pull request #5 from soachishti/download-report-feature

Download report feature
soachishti · Oct 18, 2017 · 08705bf · 08705bf
2 parents 587762d + 6f4898e
commit 08705bf
Show file tree

Hide file tree

Showing 8 changed files with 99 additions and 39 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,6 @@
 userid.txt
+submission/report.html
+submission/report/*
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/README.md b/README.md
@@ -32,7 +32,13 @@ m.addFilesByWildcard("submission/a01-*.py")
 
 url = m.send() # Submission Report URL
 
+print ("Report Url: " + url)
+
+# Save report file
 m.saveWebPage(url, "submission/report.html")
+
+# Download the whole report locally, including the linked code diff pages
+mosspy.download_report(url, "submission/report/", connections=8)
 ```
 
 ## Python Compatibility

diff --git a/moss_usage.py b/moss_usage.py
@@ -12,6 +12,11 @@
 
 m.addFilesByWildcard("submission/a01-*.py")
 
-url = m.send()
+url = m.send() 
 
+print ("Report URL: " + url)
+
+# Save report file
 m.saveWebPage(url, "submission/report.html")
+
+mosspy.download_report(url, "submission/report/", connections=8, log_level=10) # 10 = logging.DEBUG (use 20, logging.INFO, to hide debug output)
diff --git a/mosspy/__init__.py b/mosspy/__init__.py
@@ -1 +1,2 @@
-from .moss import Moss
+from .moss import Moss
+from .download_report import download_report
diff --git a/mosspy/download_report.py b/mosspy/download_report.py
@@ -0,0 +1,77 @@
+from threading import Thread
+import logging
+import os
+try:
+    from urllib.request import urlopen
+except ImportError:
+    from urllib2 import urlopen
+
+def process_url(url, urls, base_url, path):
+    from bs4 import BeautifulSoup # Backward compatibility: don't break Moss when bs4 is not available.
+
+    logging.debug ("Processing URL: " + url)
+    response = urlopen(url)
+    html = response.read()
+    soup = BeautifulSoup(html, 'lxml')
+    file_name = os.path.basename(url)
+
+    if not file_name or len(file_name.split(".")) == 1: # No usable file name: empty, or no extension (e.g. 123456789)
+        file_name = "index.html"
+
+    for more_url in soup.find_all(['a', 'frame']):
+        if more_url.has_attr('href'):
+            link = more_url.get('href')
+        else:
+            link = more_url.get('src')
+
+        if link and (link.find("match") != -1): # Only follow links to result ("match") pages
+            link = link.split('#')[0]  # remove fragment from url
+            basename = os.path.basename(link)
+
+            if basename == link: # Handling relative urls
+                link = base_url + basename
+
+            if more_url.name == "a":
+                more_url['href'] = basename
+            elif more_url.name == "frame":
+                more_url['src'] = basename
+
+            if link not in urls:                    
+                urls.append(link)
+
+    f = open(path + file_name, 'w')
+    f.write(str(soup)) # Write the modified soup so the rewritten href/src values are saved
+    f.close()
+
+def download_report(url, path, connections = 4, log_level=logging.DEBUG):
+    logging.basicConfig(level=log_level)
+
+    if len(url) == 0:
+        raise Exception("Empty url supplied")
+
+    if not os.path.exists(path):
+        os.makedirs(path)
+
+    base_url = url + "/"
+    urls = [url]
+    threads = []
+
+    logging.debug("="*80)
+    logging.debug("Downloading Moss Report - URL: " + url) 
+    logging.debug("="*80)
+
+    # Start one thread per URL; process_url appends newly found links to urls while this loop runs
+    for url in urls:
+        t = Thread(target=process_url, args=[url, urls, base_url, path])
+        t.start()
+        threads.append(t)
+
+        if len(threads) == connections or len(urls) < connections:
+            for thread in threads:
+                thread.join()
+                threads.remove(thread)
+                break
+
+    logging.debug("Waiting for all threads to complete")
+    for thread in threads:
+        thread.join()
diff --git a/mosspy/moss.py b/mosspy/moss.py
@@ -1,6 +1,7 @@
 import os
 import socket
 import glob
+import logging
 
 try:
     from urllib.request import urlopen
@@ -132,9 +133,12 @@ def send(self):
         return response.decode().replace("\n","")
 
     def saveWebPage(self, url, path):
+        if len(url) == 0:
+            raise Exception("Empty url supplied")
+
         response = urlopen(url)
         content = response.read()
 
         f = open(path, 'w')
         f.write(content.decode())
-        f.close()
+        f.close()
diff --git a/setup.py b/setup.py
@@ -19,4 +19,5 @@
   download_url = 'https://github.com/soachishti/moss.py/releases', # I'll explain this in a second
   keywords = ['moss', 'similarity', 'detecting', 'plagiarism'], # arbitrary keywords
   classifiers = [],
+  install_requires=['beautifulsoup4==4.6.0'],
 )
diff --git a/submission/report.html b/submission/report.html