This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Enhancement to Semantic Scholar Loader - full text reader #482

Merged · 9 commits · Sep 7, 2023
17 changes: 17 additions & 0 deletions llama_hub/semanticscholar/README.md
@@ -4,6 +4,23 @@ Welcome to Semantic Scholar Loader. This module serves as a crucial utility for

For any research topic you are interested in, this loader reads relevant papers from a search result in Semantic Scholar into `Documents`.

Please go through [demo_s2.ipynb](demo_s2.ipynb) for a demonstration.

## Some preliminaries

- `query_space` : broad area of research
- `query_string` : a specific question posed to the documents in the query space
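
For example (values are illustrative):

```python
# a broad research area and a specific question within it
query_space = "large language models"
query_string = "What are the limitations of large language models?"
```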

**UPDATE**:

To download the open-access PDFs and extract text from them, simply set the `full_text` flag to `True`:

```python
s2reader = SemanticScholarReader()
documents = s2reader.load_data(query_space, total_papers, full_text=True)
```
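
With `full_text=True`, the loader appends the full-text `Documents` to the abstract-based ones, so the returned list contains both. Downloaded PDFs are cached under the reader's `base_dir`:

```python
# constructor options shown with their defaults; api_key is optional
s2reader = SemanticScholarReader(timeout=10, api_key=None, base_dir="pdfs")
```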

## Usage

Here is an example of how to use this loader in `llama_index` and get citations for a given query.
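
A minimal sketch of that flow, assuming an OpenAI key is configured for the default service context (the full example lives in [demo_s2.ipynb](demo_s2.ipynb); the `CitationQueryEngine` settings here are illustrative):

```python
from llama_index import VectorStoreIndex
from llama_index.query_engine import CitationQueryEngine
from llama_hub.semanticscholar.base import SemanticScholarReader

# as defined in the preliminaries above
query_space = "large language models"
query_string = "What are the limitations of large language models?"

# load abstracts for the query space into Documents
s2reader = SemanticScholarReader()
documents = s2reader.load_data(query_space, limit=10)

# index the documents and answer the specific question with citations
index = VectorStoreIndex.from_documents(documents)
query_engine = CitationQueryEngine.from_args(
    index,
    similarity_top_k=3,
    citation_chunk_size=512,
)
response = query_engine.query(query_string)
print(response)
```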
124 changes: 119 additions & 5 deletions llama_hub/semanticscholar/base.py
@@ -3,6 +3,8 @@
from llama_index.readers.schema.base import Document
import requests
from typing import List
import os



class SemanticScholarReader(BaseReader):
@@ -19,19 +21,125 @@ class SemanticScholarReader(BaseReader):
    Loads data from Semantic Scholar based on the query and returned_fields

    """


    def __init__(self, timeout=10, api_key=None, base_dir="pdfs"):
"""
Instantiate the SemanticScholar object
"""
from semanticscholar import SemanticScholar
import arxiv


self.arxiv = arxiv
self.base_dir = base_dir
self.s2 = SemanticScholar(timeout, api_key)
# check for base dir
if not os.path.exists(self.base_dir):
os.makedirs(self.base_dir)

    def _clear_cache(self):
        """
        delete the .citation* folders
        """
        import glob
        import shutil

        # shutil.rmtree does not expand glob patterns, so resolve them first
        for folder in glob.glob("./.citation*"):
            shutil.rmtree(folder)

    def _download_pdf(self, paper_id, url: str, base_dir="pdfs"):
        logger = logging.getLogger()
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        }
        # Making a GET request
        response = requests.get(url, headers=headers, stream=True)
        content_type = response.headers.get("Content-Type", "")

        # As long as the content-type is application/pdf, this will download the file
        if "application/pdf" in content_type:
            os.makedirs(base_dir, exist_ok=True)
            file_path = os.path.join(base_dir, f"{paper_id}.pdf")
            # check if the file already exists
            if os.path.exists(file_path):
                logger.info(f"{file_path} already exists")
                return file_path
            with open(file_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        file.write(chunk)
            logger.info(f"Downloaded pdf from {url}")
            return file_path
        else:
            logger.warning(f"{url} was not downloaded: protected")
            return None

    def _get_full_text_docs(self, documents: List[Document]) -> List[Document]:
        """
        Gets the full text of the documents from Semantic Scholar

        Parameters
        ----------
        documents: list
            The list of Document objects that contain the search results

        Returns
        -------
        list
            The list of Document objects that contain the search results with full text

        Raises
        ------
        Exception
            If there is an error while getting the full text

        """
        from PyPDF2 import PdfReader
        full_text_docs = []
        for paper in documents:
            metadata = paper.extra_info
            url = metadata["openAccessPdf"]
            externalIds = metadata["externalIds"]
            paper_id = metadata["paperId"]
            file_path = None
            persist_dir = os.path.join(self.base_dir, f"{paper_id}.pdf")
            if url and not os.path.exists(persist_dir):
                # Download the document first; pass the cache directory,
                # not the target file path, as base_dir
                file_path = self._download_pdf(paper_id, url, self.base_dir)

            if (
                not url
                and externalIds
                and "ArXiv" in externalIds
                and not os.path.exists(persist_dir)
            ):
                # download the pdf from arxiv
                file_path = self._download_pdf_from_arxiv(
                    paper_id, externalIds["ArXiv"]
                )

            # reuse a previously downloaded copy if one is already on disk
            if not file_path and os.path.exists(persist_dir):
                file_path = persist_dir

            # Then, check if it's a valid PDF. If it's not, skip to the next document.
            if file_path:
                try:
                    pdf = PdfReader(open(file_path, "rb"))
                except Exception as e:
                    logging.error(
                        f"Failed to read pdf with exception: {e}. Skipping document..."
                    )
                    continue

                text = ""
                for page in pdf.pages:
                    text += page.extract_text()
                full_text_docs.append(Document(text=text, extra_info=metadata))

        return full_text_docs

    def _download_pdf_from_arxiv(self, paper_id, arxiv_id):
        # fetch the matching arXiv entry and save its PDF under base_dir
        paper = next(self.arxiv.Search(id_list=[arxiv_id], max_results=1).results())
        paper.download_pdf(dirpath=self.base_dir, filename=paper_id + ".pdf")
        return os.path.join(self.base_dir, f"{paper_id}.pdf")

    def load_data(
        self,
        query,
        limit,
        full_text=False,
        returned_fields=[
            "title",
            "abstract",
@@ -41,6 +149,7 @@ def load_data(
"citationCount",
"openAccessPdf",
"authors",
"externalIds",
],
) -> List[Document]:
"""
Expand Down Expand Up @@ -80,7 +189,7 @@ def load_data(
        documents = []

        for item in results[:limit]:
            openAccessPdf = getattr(item, "openAccessPdf", None)
            abstract = getattr(item, "abstract", None)
            title = getattr(item, "title", None)
            text = None
@@ -96,9 +205,14 @@ def load_data(
"year": getattr(item, "year", None),
"paperId": getattr(item, "paperId", None),
"citationCount": getattr(item, "citationCount", None),
"openAccessPdf": openaccesspdf.get("url") if openaccesspdf else None,
"openAccessPdf": openAccessPdf.get("url") if openAccessPdf else None,
"authors": [author["name"] for author in getattr(item, "authors", [])],
"externalIds": getattr(item, "externalIds", None),
}
documents.append(Document(text=text, extra_info=metadata))


        if full_text:
            full_text_documents = self._get_full_text_docs(documents)
            documents.extend(full_text_documents)
        return documents