This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Enhancement to Semantic Scholar Loader - full text reader #482

Merged · 9 commits · Sep 7, 2023
17 changes: 17 additions & 0 deletions llama_hub/semanticscholar/README.md
@@ -4,6 +4,23 @@ Welcome to Semantic Scholar Loader. This module serves as a crucial utility for

For any research topic you are interested in, this loader reads relevant papers from a search result in Semantic Scholar into `Documents`.

Please go through [demo_s2.ipynb](demo_s2.ipynb) for a demonstration.

## Some preliminaries

- `query_space` : broad area of research
- `query_string` : a specific question posed to the documents in the query space
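
For example (values are illustrative):

```python
# a broad research area and a specific question within it
query_space = "large language models"
query_string = "What are the limitations of large language models?"
```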

**UPDATE**:

To download the open-access PDFs and extract text from them, simply set the `full_text` flag to `True`:

```python
s2reader = SemanticScholarReader()
documents = s2reader.load_data(query_space, total_papers, full_text=True)
```
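
With `full_text=True`, the loader appends the full-text `Documents` to the abstract-based ones, so the returned list contains both. Downloaded PDFs are cached under the reader's `base_dir`:

```python
# constructor options shown with their defaults; api_key is optional
s2reader = SemanticScholarReader(timeout=10, api_key=None, base_dir="pdfs")
```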

## Usage

Here is an example of how to use this loader in `llama_index` and get citations for a given query.
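
A minimal sketch of that flow, assuming an OpenAI key is configured for the default service context (the full example lives in [demo_s2.ipynb](demo_s2.ipynb); the `CitationQueryEngine` settings here are illustrative):

```python
from llama_index import VectorStoreIndex
from llama_index.query_engine import CitationQueryEngine
from llama_hub.semanticscholar.base import SemanticScholarReader

# as defined in the preliminaries above
query_space = "large language models"
query_string = "What are the limitations of large language models?"

# load abstracts for the query space into Documents
s2reader = SemanticScholarReader()
documents = s2reader.load_data(query_space, limit=10)

# index the documents and answer the specific question with citations
index = VectorStoreIndex.from_documents(documents)
query_engine = CitationQueryEngine.from_args(
    index,
    similarity_top_k=3,
    citation_chunk_size=512,
)
response = query_engine.query(query_string)
print(response)
```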
124 changes: 119 additions & 5 deletions llama_hub/semanticscholar/base.py
@@ -3,6 +3,8 @@
from llama_index.readers.schema.base import Document
import requests
from typing import List
import os



class SemanticScholarReader(BaseReader):
@@ -19,19 +21,125 @@ class SemanticScholarReader(BaseReader):
    Loads data from Semantic Scholar based on the query and returned_fields

    """


    def __init__(self, timeout=10, api_key=None, base_dir="pdfs"):
"""
Instantiate the SemanticScholar object
"""
from semanticscholar import SemanticScholar
import arxiv


self.arxiv = arxiv
self.base_dir = base_dir
self.s2 = SemanticScholar(timeout, api_key)
# check for base dir
if not os.path.exists(self.base_dir):
os.makedirs(self.base_dir)

    def _clear_cache(self):
        """
        delete the .citation* folders
        """
        import glob
        import shutil

        # shutil.rmtree does not expand glob patterns, so resolve them first
        for folder in glob.glob("./.citation*"):
            shutil.rmtree(folder)

    def _download_pdf(self, paper_id, url: str, base_dir="pdfs"):
        logger = logging.getLogger()
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }
        # Making a GET request
        response = requests.get(url, headers=headers, stream=True)
        content_type = response.headers.get("Content-Type", "")

        # As long as the content-type is application/pdf, this will download the file
        if "application/pdf" in content_type:
            os.makedirs(base_dir, exist_ok=True)
            file_path = os.path.join(base_dir, f"{paper_id}.pdf")
            # check if the file already exists
            if os.path.exists(file_path):
                logger.info(f"{file_path} already exists")
                return file_path
            with open(file_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        file.write(chunk)
            logger.info(f"Downloaded pdf from {url}")
            return file_path
        else:
            logger.warning(f"{url} was not downloaded: protected")
            return None

    def _get_full_text_docs(self, documents: List[Document]) -> List[Document]:
        """
        Gets the full text of the documents from Semantic Scholar

        Parameters
        ----------
        documents: list
            The list of Document objects that contain the search results

        Returns
        -------
        list
            The list of Document objects that contain the search results with full text

        Raises
        ------
        Exception
            If there is an error while getting the full text

        """
        from PyPDF2 import PdfReader
        full_text_docs = []
        for paper in documents:
            metadata = paper.extra_info
            url = metadata["openAccessPdf"]
            externalIds = metadata["externalIds"]
            paper_id = metadata["paperId"]
            file_path = None
            persist_dir = os.path.join(self.base_dir, f"{paper_id}.pdf")
            if url and not os.path.exists(persist_dir):
                # Download the document first; pass the cache directory,
                # not the target file path, as base_dir
                file_path = self._download_pdf(paper_id, url, self.base_dir)

            if (
                not url
                and externalIds
                and "ArXiv" in externalIds
                and not os.path.exists(persist_dir)
            ):
                # download the pdf from arxiv
                file_path = self._download_pdf_from_arxiv(
                    paper_id, externalIds["ArXiv"]
                )

            # reuse a previously downloaded copy if one is already on disk
            if not file_path and os.path.exists(persist_dir):
                file_path = persist_dir

            # Then, check if it's a valid PDF. If it's not, skip to the next document.
            if file_path:
                try:
                    pdf = PdfReader(open(file_path, "rb"))
                except Exception as e:
                    logging.error(
                        f"Failed to read pdf with exception: {e}. Skipping document..."
                    )
                    continue

                text = ""
                for page in pdf.pages:
                    text += page.extract_text()
                full_text_docs.append(Document(text=text, extra_info=metadata))

        return full_text_docs

    def _download_pdf_from_arxiv(self, paper_id, arxiv_id):
        # fetch the matching arXiv entry and save its PDF under base_dir
        paper = next(self.arxiv.Search(id_list=[arxiv_id], max_results=1).results())
        paper.download_pdf(dirpath=self.base_dir, filename=paper_id + ".pdf")
        return os.path.join(self.base_dir, f"{paper_id}.pdf")

    def load_data(
        self,
        query,
        limit,
        full_text=False,
        returned_fields=[
            "title",
            "abstract",
@@ -41,6 +149,7 @@ def load_data(
"citationCount",
"openAccessPdf",
"authors",
"externalIds",
],
) -> List[Document]:
"""
Expand Down Expand Up @@ -80,7 +189,7 @@ def load_data(
        documents = []

        for item in results[:limit]:
            openAccessPdf = getattr(item, "openAccessPdf", None)
            abstract = getattr(item, "abstract", None)
            title = getattr(item, "title", None)
            text = None
@@ -96,9 +205,14 @@ def load_data(
"year": getattr(item, "year", None),
"paperId": getattr(item, "paperId", None),
"citationCount": getattr(item, "citationCount", None),
"openAccessPdf": openaccesspdf.get("url") if openaccesspdf else None,
"openAccessPdf": openAccessPdf.get("url") if openAccessPdf else None,
"authors": [author["name"] for author in getattr(item, "authors", [])],
"externalIds": getattr(item, "externalIds", None),
}
documents.append(Document(text=text, extra_info=metadata))


        if full_text:
            full_text_documents = self._get_full_text_docs(documents)
            documents.extend(full_text_documents)
        return documents