diff --git a/llama_hub/semanticscholar/README.md b/llama_hub/semanticscholar/README.md index 3010c578b9..04c804d313 100644 --- a/llama_hub/semanticscholar/README.md +++ b/llama_hub/semanticscholar/README.md @@ -4,6 +4,23 @@ Welcome to Semantic Scholar Loader. This module serves as a crucial utility for For any research topic you are interested in, this loader reads relevant papers from a search result in Semantic Scholar into `Documents`. +Please go through [demo_s2.ipynb](demo_s2.ipynb) for a working example. + +## Some preliminaries + +- `query_space`: the broad area of research +- `query_string`: a specific question about the documents in the query space + +**UPDATE**: + +To download the open access pdfs and extract text from them, simply set the `full_text` flag to `True`: + + +```python +s2reader = SemanticScholarReader() +documents = s2reader.load_data(query_space, total_papers, full_text=True) +``` + ## Usage Here is an example of how to use this loader in `llama_index` and get citations for a given query. 
diff --git a/llama_hub/semanticscholar/base.py b/llama_hub/semanticscholar/base.py index 1fbd131bfb..1bf5658c59 100644 --- a/llama_hub/semanticscholar/base.py +++ b/llama_hub/semanticscholar/base.py @@ -3,6 +3,9 @@ from llama_index.readers.schema.base import Document import requests from typing import List +import logging +import os + class SemanticScholarReader(BaseReader): @@ -19,19 +22,127 @@ class SemanticScholarReader(BaseReader): Loads data from Semantic Scholar based on the query and returned_fields """ + - def __init__(self): + def __init__(self, timeout=10, api_key=None, base_dir="pdfs"): """ Instantiate the SemanticScholar object """ from semanticscholar import SemanticScholar + import arxiv + + + self.arxiv = arxiv + self.base_dir = base_dir + self.s2 = SemanticScholar(timeout, api_key) + # check for base dir + if not os.path.exists(self.base_dir): + os.makedirs(self.base_dir) + + def _clear_cache(self): + """ + delete the .citation* folders (rmtree does not expand wildcards) + """ + import glob + import shutil + + for folder in glob.glob("./.citation*"): + shutil.rmtree(folder) + + def _download_pdf(self, paper_id, url: str, base_dir="pdfs"): + logger = logging.getLogger() + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" + } + # Making a GET request + response = requests.get(url, headers=headers, stream=True) + content_type = response.headers["Content-Type"] + + # As long as the content-type is application/pdf, this will download the file + if "application/pdf" in content_type: + os.makedirs(base_dir, exist_ok=True) + file_path = os.path.join(base_dir, f"{paper_id}.pdf") + # check if the file already exists + if os.path.exists(file_path): + logger.info(f"{file_path} already exists") + return file_path + with open(file_path, "wb") as file: + for chunk in response.iter_content(chunk_size=1024): + if chunk: + file.write(chunk) + logger.info(f"Downloaded pdf from {url}") + return file_path + else: + logger.warning(f"{url} was not downloaded: protected") + 
return None + + def _get_full_text_docs(self, documents: List[Document]) -> List[Document]: + from PyPDF2 import PdfReader - self.s2 = SemanticScholar() + """ + Gets the full text of the documents from Semantic Scholar + + Parameters + ---------- + documents: list + The list of Document object that contains the search results + + Returns + ------- + list + The list of Document object that contains the search results with full text + + Raises + ------ + Exception + If there is an error while getting the full text + + """ + full_text_docs = [] + for paper in documents: + metadata = paper.extra_info + url = metadata["openAccessPdf"] + externalIds = metadata["externalIds"] + paper_id = metadata["paperId"] + file_path = None + persist_dir = os.path.join(self.base_dir, f"{paper_id}.pdf") + if url and not os.path.exists(persist_dir): + # Download the document first (pass the base directory, not the pdf path) + file_path = self._download_pdf(metadata["paperId"], url, self.base_dir) + + if not url and externalIds and "ArXiv" in externalIds and not os.path.exists(persist_dir): + # download the pdf from arxiv + file_path = self._download_pdf_from_arxiv( + paper_id, externalIds["ArXiv"] + ) + + # Then, check if it's a valid PDF. If it's not, skip to the next document. + if file_path: + try: + pdf = PdfReader(open(file_path, "rb")) + except Exception as e: + logging.error( + f"Failed to read pdf with exception: {e}. Skipping document..." 
+ ) + continue + + text = "" + for page in pdf.pages: + text += page.extract_text() + full_text_docs.append(Document(text=text, extra_info=metadata)) + + return full_text_docs + + def _download_pdf_from_arxiv(self, paper_id, arxiv_id): + paper = next(self.arxiv.Search(id_list=[arxiv_id], max_results=1).results()) + paper.download_pdf(dirpath=self.base_dir, filename=paper_id + ".pdf") + return os.path.join(self.base_dir, f"{paper_id}.pdf") def load_data( self, query, - limit=10, + limit=10, + full_text=False, returned_fields=[ "title", "abstract", @@ -41,6 +149,7 @@ def load_data( "citationCount", "openAccessPdf", "authors", + "externalIds", ], ) -> List[Document]: """ @@ -80,7 +189,7 @@ def load_data( documents = [] for item in results[:limit]: - openaccesspdf = getattr(item, "openAccessPdf", None) + openAccessPdf = getattr(item, "openAccessPdf", None) abstract = getattr(item, "abstract", None) title = getattr(item, "title", None) text = None @@ -96,9 +205,14 @@ def load_data( "year": getattr(item, "year", None), "paperId": getattr(item, "paperId", None), "citationCount": getattr(item, "citationCount", None), - "openAccessPdf": openaccesspdf.get("url") if openaccesspdf else None, + "openAccessPdf": openAccessPdf.get("url") if openAccessPdf else None, "authors": [author["name"] for author in getattr(item, "authors", [])], + "externalIds": getattr(item, "externalIds", None), } documents.append(Document(text=text, extra_info=metadata)) + + if full_text: + full_text_documents = self._get_full_text_docs(documents) + documents.extend(full_text_documents) return documents diff --git a/llama_hub/semanticscholar/demo_s2.ipynb b/llama_hub/semanticscholar/demo_s2.ipynb new file mode 100644 index 0000000000..31933ab42b --- /dev/null +++ b/llama_hub/semanticscholar/demo_s2.ipynb @@ -0,0 +1,651 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Semantic Scholar Loader in llama-index" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + 
"metadata": {}, + "outputs": [], + "source": [ + "from llama_hub.semanticscholar.base import SemanticScholarReader\n", + "import os\n", + "import openai\n", + "from llama_index.llms import OpenAI\n", + "from llama_index.query_engine import CitationQueryEngine\n", + "from llama_index import (\n", + " VectorStoreIndex,\n", + " StorageContext,\n", + " load_index_from_storage,\n", + " ServiceContext,\n", + ")\n", + "from llama_index.response.notebook_utils import display_response\n", + "\n", + "# initialize the SemanticScholarReader\n", + "s2reader = SemanticScholarReader()\n", + "\n", + "# initialize the service context\n", + "openai.api_key = os.environ[\"OPENAI_API_KEY\"]\n", + "service_context = ServiceContext.from_defaults(\n", + " llm=OpenAI(model=\"gpt-3.5-turbo\", temperature=0)\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "**`Final Response:`** Large language models have limitations in terms of their training cost and computational resources [1]. While they can be efficient once trained, generating content from a trained model can still consume significant resources [1]. Techniques like model distillation can help reduce the cost of these models [1]. Additionally, increasing the size of language models may not necessarily improve their performance on long-tail knowledge or rare instances [3]. Scaling up models alone may not be sufficient to achieve high accuracy on specific types of questions [3]. There is also a need to modify the training objective or increase the number of training epochs to encourage memorization and focus on salient facts [4]. It is important to be cautious in how we talk about large language models, avoiding anthropomorphism and recognizing their limitations [5]." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 1/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 35028af6-85ea-4f55-a5b4-bfe11778cbb2
**Similarity:** 0.8679221353278955
**Text:** Source 1:\n", + "consume significant resources during training, they can be surprisingly efficient once tr...
**Metadata:** {'title': 'Language Models are Few-Shot Learners', 'venue': 'Neural Information Processing Systems', 'year': 2020, 'paperId': '6b85b63579a916f705a8e10a49bd8d849d91b1fc', 'citationCount': 14032, 'openAccessPdf': None, 'authors': ['Tom B. Brown', 'Benjamin Mann', 'Nick Ryder', 'Melanie Subbiah', 'J. Kaplan', 'Prafulla Dhariwal', 'Arvind Neelakantan', 'Pranav Shyam', 'Girish Sastry', 'Amanda Askell', 'Sandhini Agarwal', 'Ariel Herbert-Voss', 'Gretchen Krueger', 'T. Henighan', 'Rewon Child', 'A. Ramesh', 'Daniel M. Ziegler', 'Jeff Wu', 'Clemens Winter', 'Christopher Hesse', 'Mark Chen', 'Eric Sigler', 'Mateusz Litwin', 'Scott Gray', 'Benjamin Chess', 'Jack Clark', 'Christopher Berner', 'Sam McCandlish', 'Alec Radford', 'Ilya Sutskever', 'Dario Amodei'], 'externalIds': {'DBLP': 'journals/corr/abs-2005-14165', 'ArXiv': '2005.14165', 'MAG': '3030163527', 'CorpusId': 218971783}}
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 2/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** e5f116b6-5fe9-4020-90c6-59ad6d28d0f1
**Similarity:** 0.8679221353278955
**Text:** Source 2:\n", + "Our work focuses on the first approach (scaling compute and parameters together,\n", + "by stra...
**Metadata:** {'title': 'Language Models are Few-Shot Learners', 'venue': 'Neural Information Processing Systems', 'year': 2020, 'paperId': '6b85b63579a916f705a8e10a49bd8d849d91b1fc', 'citationCount': 14032, 'openAccessPdf': None, 'authors': ['Tom B. Brown', 'Benjamin Mann', 'Nick Ryder', 'Melanie Subbiah', 'J. Kaplan', 'Prafulla Dhariwal', 'Arvind Neelakantan', 'Pranav Shyam', 'Girish Sastry', 'Amanda Askell', 'Sandhini Agarwal', 'Ariel Herbert-Voss', 'Gretchen Krueger', 'T. Henighan', 'Rewon Child', 'A. Ramesh', 'Daniel M. Ziegler', 'Jeff Wu', 'Clemens Winter', 'Christopher Hesse', 'Mark Chen', 'Eric Sigler', 'Mateusz Litwin', 'Scott Gray', 'Benjamin Chess', 'Jack Clark', 'Christopher Berner', 'Sam McCandlish', 'Alec Radford', 'Ilya Sutskever', 'Dario Amodei'], 'externalIds': {'DBLP': 'journals/corr/abs-2005-14165', 'ArXiv': '2005.14165', 'MAG': '3030163527', 'CorpusId': 218971783}}
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 3/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 3bd5a072-739a-4d4d-bd2d-30b8a50e26f1
**Similarity:** 0.864251829100195
**Text:** Source 3:\n", + "small accu-\n", + "racy gains.An alternative idea would be to increase the\n", + "diversity of the pr...
**Metadata:** {'title': 'Large Language Models Struggle to Learn Long-Tail Knowledge', 'venue': 'International Conference on Machine Learning', 'year': 2022, 'paperId': '75f7e9e2b59fb640ef9d1dff94097175daf46c4d', 'citationCount': 35, 'openAccessPdf': None, 'authors': ['Nikhil Kandpal', 'H. Deng', 'Adam Roberts', 'Eric Wallace', 'Colin Raffel'], 'externalIds': {'DBLP': 'journals/corr/abs-2211-08411', 'ArXiv': '2211.08411', 'DOI': '10.48550/arXiv.2211.08411', 'CorpusId': 253522998}}
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 4/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 8ee81016-da93-4d9a-b038-f47020260f85
**Similarity:** 0.864251829100195
**Text:** Source 4:\n", + "All of the LMs that we study do limited epochs,\n", + "as it is generally seen as preferable t...
**Metadata:** {'title': 'Large Language Models Struggle to Learn Long-Tail Knowledge', 'venue': 'International Conference on Machine Learning', 'year': 2022, 'paperId': '75f7e9e2b59fb640ef9d1dff94097175daf46c4d', 'citationCount': 35, 'openAccessPdf': None, 'authors': ['Nikhil Kandpal', 'H. Deng', 'Adam Roberts', 'Eric Wallace', 'Colin Raffel'], 'externalIds': {'DBLP': 'journals/corr/abs-2211-08411', 'ArXiv': '2211.08411', 'DOI': '10.48550/arXiv.2211.08411', 'CorpusId': 253522998}}
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 5/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 17bbc1ce-f975-4ef2-a0c5-9110ade5225f
**Similarity:** 0.8627260872607259
**Text:** Source 5:\n", + "Well, this is fine\n", + "as long as there is no possibility of anyone as-\n", + "signing more weight ...
**Metadata:** {'title': 'Talking About Large Language Models', 'venue': 'arXiv.org', 'year': 2022, 'paperId': '3eed4de25636ac90f39f6e1ef70e3507ed61a2a6', 'citationCount': 43, 'openAccessPdf': None, 'authors': ['M. Shanahan'], 'externalIds': {'ArXiv': '2212.03551', 'DBLP': 'journals/corr/abs-2212-03551', 'DOI': '10.48550/arXiv.2212.03551', 'CorpusId': 254366666}}
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 6/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 48150b03-0525-498f-8601-c4698954a7db
**Similarity:** 0.8627260872607259
**Text:** Source 6:\n", + "Acknowledgments\n", + "Thanks to Toni Creswell, Richard Evans, Chris-\n", + "tos Kaplanis, Andrew Lam...
**Metadata:** {'title': 'Talking About Large Language Models', 'venue': 'arXiv.org', 'year': 2022, 'paperId': '3eed4de25636ac90f39f6e1ef70e3507ed61a2a6', 'citationCount': 43, 'openAccessPdf': None, 'authors': ['M. Shanahan'], 'externalIds': {'ArXiv': '2212.03551', 'DBLP': 'journals/corr/abs-2212-03551', 'DOI': '10.48550/arXiv.2212.03551', 'CorpusId': 254366666}}
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "query_space = \"large language models\"\n", + "full_text = True\n", + "# be careful with the total_papers when full_text = True\n", + "# it can take a long time to download\n", + "total_papers = 50\n", + "\n", + "persist_dir = (\n", + " \"./citation_\" + query_space + \"_\" + str(total_papers) + \"_\" + str(full_text)\n", + ")\n", + "\n", + "if not os.path.exists(persist_dir):\n", + " # Load data from Semantic Scholar\n", + " documents = s2reader.load_data(query_space, total_papers, full_text=full_text)\n", + " index = VectorStoreIndex.from_documents(documents, service_context=service_context)\n", + " index.storage_context.persist(persist_dir=persist_dir)\n", + "else:\n", + " index = load_index_from_storage(\n", + " StorageContext.from_defaults(persist_dir=persist_dir),\n", + " service_context=service_context,\n", + " )\n", + " \n", + "# initialize the citation query engine\n", + "query_engine = CitationQueryEngine.from_args(\n", + " index,\n", + " similarity_top_k=3,\n", + " citation_chunk_size=512,\n", + ")\n", + "\n", + "query_string = \"limitations of using large language models\"\n", + "\n", + "# query the citation query engine\n", + "response = query_engine.query(query_string)\n", + "display_response(response, show_source=True, source_length=100, show_source_metadata=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "**`Final Response:`** The efficacy numbers of the COVID-19 vaccines are as follows:\n", + "\n", + "- NVX-CoV2373: 49% efficacy against the B.1.351 variant, increasing to 60% when excluding HIV-positive individuals [1].\n", + "- Ad26.COV2-S: 72% efficacy against PCR-confirmed infection in the USA, reduced to 66% efficacy in Latin America and 57% efficacy in South Africa [1].\n", + "- AZD1222: Did not demonstrate protection against mild to 
moderate B.1.351-induced COVID-19 [1].\n", + "- BNT162b2: Elicited antibodies with neutralizing activity against B.1.1.7 and P.1 variants [1].\n", + "- CoronaVac: 50% efficacy against symptomatic infection [1].\n", + "- Sinopharm (BBIBP-CorV): 78% efficacy against COVID-19 [5].\n", + "- Novavax (NVX-CoV2373): 89% efficacy against symptomatic COVID-19 [5].\n", + "- VECTOR (EpiVacCorona): No data available [5].\n", + "\n", + "Note: These efficacy numbers are based on the provided sources and may not represent the most up-to-date information." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 1/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 4e5f66ec-7455-436c-ad7b-db666bfa0f1f
**Similarity:** 0.8624234672546093
**Text:** Source 1:\n", + "NVX-CoV2373 \n", + "showed an efficacy of 49% against the \n", + "B.1.351 variant in the prevention o...
**Metadata:** {'title': 'Progress of the COVID-19 vaccine effort: viruses, vaccines and variants versus efficacy, effectiveness and escape', 'venue': 'Nature reviews. Immunology', 'year': 2021, 'paperId': 'b8b7b90263b1168d9466feb99ce2ce1efa7514b3', 'citationCount': 603, 'openAccessPdf': 'https://www.nature.com/articles/s41577-021-00592-1.pdf', 'authors': ['J. Tregoning', 'Katie E. Flight', 'Sophie L. Higham', 'Ziyin Wang', 'B. F. Pierce'], 'externalIds': {'PubMedCentral': '8351583', 'DOI': '10.1038/s41577-021-00592-1', 'CorpusId': 236968006, 'PubMed': '34373623'}}
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 2/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** f0144cc7-f35b-49c7-aca4-030bed69ee1e
**Similarity:** 0.8624234672546093
**Text:** Source 2:\n", + "617.2) variant.A significant \n", + "decrease in neutralizing antibody titre \n", + "has been seen fo...
**Metadata:** {'title': 'Progress of the COVID-19 vaccine effort: viruses, vaccines and variants versus efficacy, effectiveness and escape', 'venue': 'Nature reviews. Immunology', 'year': 2021, 'paperId': 'b8b7b90263b1168d9466feb99ce2ce1efa7514b3', 'citationCount': 603, 'openAccessPdf': 'https://www.nature.com/articles/s41577-021-00592-1.pdf', 'authors': ['J. Tregoning', 'Katie E. Flight', 'Sophie L. Higham', 'Ziyin Wang', 'B. F. Pierce'], 'externalIds': {'PubMedCentral': '8351583', 'DOI': '10.1038/s41577-021-00592-1', 'CorpusId': 236968006, 'PubMed': '34373623'}}
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 3/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** ddceeaeb-a91f-403b-a4ed-8f7cc8307ccb
**Similarity:** 0.8616244247348551
**Text:** Source 3:\n", + "The only valid way to compare vaccines directly is \n", + "in head-to-head efficacy trials, wh...
**Metadata:** {'title': 'Progress of the COVID-19 vaccine effort: viruses, vaccines and variants versus efficacy, effectiveness and escape', 'venue': 'Nature reviews. Immunology', 'year': 2021, 'paperId': 'b8b7b90263b1168d9466feb99ce2ce1efa7514b3', 'citationCount': 603, 'openAccessPdf': 'https://www.nature.com/articles/s41577-021-00592-1.pdf', 'authors': ['J. Tregoning', 'Katie E. Flight', 'Sophie L. Higham', 'Ziyin Wang', 'B. F. Pierce'], 'externalIds': {'PubMedCentral': '8351583', 'DOI': '10.1038/s41577-021-00592-1', 'CorpusId': 236968006, 'PubMed': '34373623'}}
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 4/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 0976ea2c-4d05-4201-8ca1-a585053b6756
**Similarity:** 0.8616244247348551
**Text:** Source 4:\n", + "population studied and prevalence \n", + "of SARS-CoV-2 variants at the time of the \n", + "trial, it...
**Metadata:** {'title': 'Progress of the COVID-19 vaccine effort: viruses, vaccines and variants versus efficacy, effectiveness and escape', 'venue': 'Nature reviews. Immunology', 'year': 2021, 'paperId': 'b8b7b90263b1168d9466feb99ce2ce1efa7514b3', 'citationCount': 603, 'openAccessPdf': 'https://www.nature.com/articles/s41577-021-00592-1.pdf', 'authors': ['J. Tregoning', 'Katie E. Flight', 'Sophie L. Higham', 'Ziyin Wang', 'B. F. Pierce'], 'externalIds': {'PubMedCentral': '8351583', 'DOI': '10.1038/s41577-021-00592-1', 'CorpusId': 236968006, 'PubMed': '34373623'}}
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 5/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 21168cc5-8587-4a61-9c0c-4ebad48e2653
**Similarity:** 0.8593642969779912
**Text:** Source 5:\n", + "Although differences in how the \n", + "clinical trials were set up make comparison \n", + "between v...
**Metadata:** {'title': 'Progress of the COVID-19 vaccine effort: viruses, vaccines and variants versus efficacy, effectiveness and escape', 'venue': 'Nature reviews. Immunology', 'year': 2021, 'paperId': 'b8b7b90263b1168d9466feb99ce2ce1efa7514b3', 'citationCount': 603, 'openAccessPdf': 'https://www.nature.com/articles/s41577-021-00592-1.pdf', 'authors': ['J. Tregoning', 'Katie E. Flight', 'Sophie L. Higham', 'Ziyin Wang', 'B. F. Pierce'], 'externalIds': {'PubMedCentral': '8351583', 'DOI': '10.1038/s41577-021-00592-1', 'CorpusId': 236968006, 'PubMed': '34373623'}}
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 6/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 344121c1-d02a-4149-87c1-4c682d5d7dc1
**Similarity:** 0.8593642969779912
**Text:** Source 6:\n", + "laboratory \n", + "confirmed COVID-19 \n", + "within \n", + "6 months after first dose≥18 years old 9 months...
**Metadata:** {'title': 'Progress of the COVID-19 vaccine effort: viruses, vaccines and variants versus efficacy, effectiveness and escape', 'venue': 'Nature reviews. Immunology', 'year': 2021, 'paperId': 'b8b7b90263b1168d9466feb99ce2ce1efa7514b3', 'citationCount': 603, 'openAccessPdf': 'https://www.nature.com/articles/s41577-021-00592-1.pdf', 'authors': ['J. Tregoning', 'Katie E. Flight', 'Sophie L. Higham', 'Ziyin Wang', 'B. F. Pierce'], 'externalIds': {'PubMedCentral': '8351583', 'DOI': '10.1038/s41577-021-00592-1', 'CorpusId': 236968006, 'PubMed': '34373623'}}
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "query_space = \"covid 19 vaccine\"\n", + "query_string = \"List the efficacy numbers of the covid 19 vaccines\"\n", + "full_text = True\n", + "# be careful with the total_papers when full_text = True\n", + "# it can take a long time to download\n", + "total_papers = 50\n", + "\n", + "persist_dir = (\n", + " \"./citation_\" + query_space + \"_\" + str(total_papers) + \"_\" + str(full_text)\n", + ")\n", + "\n", + "if not os.path.exists(persist_dir):\n", + " # Load data from Semantic Scholar\n", + " documents = s2reader.load_data(query_space, total_papers, full_text=full_text)\n", + " index = VectorStoreIndex.from_documents(documents, service_context=service_context)\n", + " index.storage_context.persist(persist_dir=persist_dir)\n", + "else:\n", + " index = load_index_from_storage(\n", + " StorageContext.from_defaults(persist_dir=persist_dir),\n", + " service_context=service_context,\n", + " )\n", + " \n", + "# initialize the citation query engine\n", + "query_engine = CitationQueryEngine.from_args(\n", + " index,\n", + " similarity_top_k=3,\n", + " citation_chunk_size=512,\n", + ")\n", + "\n", + "# query the citation query engine\n", + "response = query_engine.query(query_string)\n", + "display_response(response, show_source=True, source_length=100, show_source_metadata=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git 
a/llama_hub/semanticscholar/requirements.txt b/llama_hub/semanticscholar/requirements.txt index 8095340187..f5c4d1fcb5 100644 --- a/llama_hub/semanticscholar/requirements.txt +++ b/llama_hub/semanticscholar/requirements.txt @@ -1 +1,3 @@ -semanticscholar==0.4.1 \ No newline at end of file +semanticscholar==0.4.1 +arxiv==1.4.8 +PyPDF2==3.0.1 \ No newline at end of file diff --git a/llama_hub/semanticscholar/test.py b/llama_hub/semanticscholar/test.py index 6c28420ad6..ea54772ce3 100644 --- a/llama_hub/semanticscholar/test.py +++ b/llama_hub/semanticscholar/test.py @@ -20,11 +20,21 @@ ) query_space = "large language models" -persist_dir = "./citation_" + query_space +query_string = "limitations of using large language models" +full_text = True +# be careful with the total_papers when full_text = True +# it can take a long time to download +total_papers = 50 + +persist_dir = ( + "./citation_" + query_space + "_" + str(total_papers) + "_" + str(full_text) +) + + if not os.path.exists(persist_dir): # Load data from Semantic Scholar - documents = s2reader.load_data(query=query_space, limit=10) + documents = s2reader.load_data(query_space, total_papers, full_text=full_text) index = VectorStoreIndex.from_documents(documents, service_context=service_context) index.storage_context.persist(persist_dir=persist_dir) else: @@ -40,7 +50,7 @@ ) # query the citation query engine -response = query_engine.query("limitations of using large language models") +response = query_engine.query(query_string) print("Answer: ", response) print("Source nodes: ") for node in response.source_nodes: