From c84e9d4a72da9ab9cb43672fae7cf4ba9c151cdd Mon Sep 17 00:00:00 2001 From: shauryr Date: Thu, 24 Aug 2023 19:57:40 -0500 Subject: [PATCH 1/8] added arxiv and pypdf --- llama_hub/semanticscholar/requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_hub/semanticscholar/requirements.txt b/llama_hub/semanticscholar/requirements.txt index 8095340187..f5c4d1fcb5 100644 --- a/llama_hub/semanticscholar/requirements.txt +++ b/llama_hub/semanticscholar/requirements.txt @@ -1 +1,3 @@ -semanticscholar==0.4.1 \ No newline at end of file +semanticscholar==0.4.1 +arxiv==1.4.8 +PyPDF2==3.0.1 \ No newline at end of file From 392579ff062d727cce5f14e8890c14ff7859cd2c Mon Sep 17 00:00:00 2001 From: shauryr Date: Thu, 24 Aug 2023 20:00:30 -0500 Subject: [PATCH 2/8] added full text support --- llama_hub/semanticscholar/base.py | 124 ++++++++++++++++++++++++++++-- 1 file changed, 119 insertions(+), 5 deletions(-) diff --git a/llama_hub/semanticscholar/base.py b/llama_hub/semanticscholar/base.py index 1fbd131bfb..d75c56fdb7 100644 --- a/llama_hub/semanticscholar/base.py +++ b/llama_hub/semanticscholar/base.py @@ -3,6 +3,8 @@ from llama_index.readers.schema.base import Document import requests from typing import List +import os + class SemanticScholarReader(BaseReader): @@ -19,19 +21,125 @@ class SemanticScholarReader(BaseReader): Loads data from Semantic Scholar based on the query and returned_fields """ + - def __init__(self): + def __init__(self, timeout=10, api_key=None, base_dir="pdfs"): """ Instantiate the SemanticScholar object """ from semanticscholar import SemanticScholar + import arxiv + + + self.arxiv = arxiv + self.base_dir = base_dir + self.s2 = SemanticScholar(timeout, api_key) + # check for base dir + if not os.path.exists(self.base_dir): + os.makedirs(self.base_dir) + + def _clear_cache(self): + """ + delete the .citation* folder + """ + import shutil + + shutil.rmtree("./.citation*") + + def _download_pdf(self, paper_id, url: str, base_dir="pdfs"): + logger = logging.getLogger() + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" + } + # Making a GET request + response = requests.get(url, headers=headers, stream=True) + content_type = response.headers["Content-Type"] + + # As long as the content-type is application/pdf, this will download the file + if "application/pdf" in content_type: + os.makedirs(base_dir, exist_ok=True) + file_path = os.path.join(base_dir, f"{paper_id}.pdf") + # check if the file already exists + if os.path.exists(file_path): + logger.info(f"{file_path} already exists") + return file_path + with open(file_path, "wb") as file: + for chunk in response.iter_content(chunk_size=1024): + if chunk: + file.write(chunk) + logger.info(f"Downloaded pdf from {url}") + return file_path + else: + logger.warning(f"{url} did not point to a pdf file") + return None + + def _get_full_text_docs(self, documents: List[Document]) -> List[Document]: + from PyPDF2 import PdfReader - self.s2 = SemanticScholar() + """ + Gets the full text of the documents from Semantic Scholar + + Parameters + ---------- + documents: list + The list of Document object that contains the search results + + Returns + ------- + list + The list of Document object that contains the search results with full text + + Raises + ------ + Exception + If there is an error while getting the full text + + """ + full_text_docs = [] + for paper in documents: + metadata = paper.extra_info + url = metadata["openAccessPdf"] + externalIds = metadata["externalIds"] + paper_id = metadata["paperId"] + file_path = None + persist_dir = os.path.join(self.base_dir, f"{paper_id}.pdf") + if url and not os.path.exists(persist_dir): + # Download the document first + file_path = self._download_pdf(metadata["paperId"], url, persist_dir) + + if not url and externalIds and "ArXiv" in externalIds and not os.path.exists(persist_dir): + # download the pdf from arxiv + file_path = self._download_pdf_from_arxiv( + paper_id, externalIds["ArXiv"] + ) + + # Then, check if it's a valid PDF. If it's not, skip to the next document. + if file_path: + try: + pdf = PdfReader(open(file_path, "rb")) + except Exception as e: + logging.error( + f"Failed to read pdf with exception: {e}. Skipping document..." + ) + continue + + text = "" + for page in pdf.pages: + text += page.extract_text() + full_text_docs.append(Document(text=text, extra_info=metadata)) + + return full_text_docs + + def _download_pdf_from_arxiv(self, paper_id, arxiv_id): + paper = next(self.arxiv.Search(id_list=[arxiv_id], max_results=1).results()) + paper.download_pdf(dirpath=self.base_dir, filename=paper_id + ".pdf") + return os.path.join(self.base_dir, f"{paper_id}.pdf") def load_data( self, query, - limit=10, + limit, + full_text=False, returned_fields=[ "title", "abstract", @@ -41,6 +149,7 @@ def load_data( "citationCount", "openAccessPdf", "authors", + "externalIds", ], ) -> List[Document]: """ @@ -80,7 +189,7 @@ def load_data( documents = [] for item in results[:limit]: - openaccesspdf = getattr(item, "openAccessPdf", None) + openAccessPdf = getattr(item, "openAccessPdf", None) abstract = getattr(item, "abstract", None) title = getattr(item, "title", None) text = None @@ -96,9 +205,14 @@ def load_data( "year": getattr(item, "year", None), "paperId": getattr(item, "paperId", None), "citationCount": getattr(item, "citationCount", None), - "openAccessPdf": openaccesspdf.get("url") if openaccesspdf else None, + "openAccessPdf": openAccessPdf.get("url") if openAccessPdf else None, "authors": [author["name"] for author in getattr(item, "authors", [])], + "externalIds": getattr(item, "externalIds", None), } documents.append(Document(text=text, extra_info=metadata)) + + if full_text: + full_text_documents = self._get_full_text_docs(documents) + documents.extend(full_text_documents) return documents From 5532e9337c897ffdd5320cf8da2eb7bc50e3caaa Mon Sep 17 00:00:00 2001 From: shauryr Date: Thu, 24 Aug 2023 20:00:52 -0500 Subject: [PATCH 3/8] added demo with full text --- llama_hub/semanticscholar/test.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/llama_hub/semanticscholar/test.py b/llama_hub/semanticscholar/test.py index 6c28420ad6..ea54772ce3 100644 --- a/llama_hub/semanticscholar/test.py +++ b/llama_hub/semanticscholar/test.py @@ -20,11 +20,21 @@ ) query_space = "large language models" -persist_dir = "./citation_" + query_space +query_string = "limitations of using large language models" +full_text = True +# be careful with the total_papers when full_text = True +# it can take a long time to download +total_papers = 50 + +persist_dir = ( + "./citation_" + query_space + "_" + str(total_papers) + "_" + str(full_text) +) + + if not os.path.exists(persist_dir): # Load data from Semantic Scholar - documents = s2reader.load_data(query=query_space, limit=10) + documents = s2reader.load_data(query_space, total_papers, full_text=full_text) index = VectorStoreIndex.from_documents(documents, service_context=service_context) index.storage_context.persist(persist_dir=persist_dir) else: @@ -40,7 +50,7 @@ ) # query the citation query engine -response = query_engine.query("limitations of using large language models") +response = query_engine.query(query_string) print("Answer: ", response) print("Source nodes: ") for node in response.source_nodes: From ff5578f96c19a989fffc42f622df1a5bc8c47001 Mon Sep 17 00:00:00 2001 From: shauryr Date: Thu, 24 Aug 2023 20:26:55 -0500 Subject: [PATCH 4/8] changes to warning msg --- llama_hub/semanticscholar/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_hub/semanticscholar/base.py b/llama_hub/semanticscholar/base.py index d75c56fdb7..1bf5658c59 100644 --- a/llama_hub/semanticscholar/base.py +++ b/llama_hub/semanticscholar/base.py @@ -70,7 +70,7 @@ def _download_pdf(self, paper_id, url: str, base_dir="pdfs"): logger.info(f"Downloaded pdf from {url}") return file_path else: - logger.warning(f"{url} did not point to a pdf file") + logger.warning(f"{url} was not downloaded: protected") return None def _get_full_text_docs(self, documents: List[Document]) -> List[Document]: From 9883b8c40a5af7e9a11c486d6c22d8266ea631d4 Mon Sep 17 00:00:00 2001 From: shauryr Date: Thu, 24 Aug 2023 20:27:05 -0500 Subject: [PATCH 5/8] demo notebook --- llama_hub/semanticscholar/demo_s2.ipynb | 655 ++++++++++++++++++++++++ 1 file changed, 655 insertions(+) create mode 100644 llama_hub/semanticscholar/demo_s2.ipynb diff --git a/llama_hub/semanticscholar/demo_s2.ipynb b/llama_hub/semanticscholar/demo_s2.ipynb new file mode 100644 index 0000000000..74d0b4ba3b --- /dev/null +++ b/llama_hub/semanticscholar/demo_s2.ipynb @@ -0,0 +1,655 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Demo: Semantic Scholar Loader in llama-index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Some preliminaries -\n", + "\n", + "- `query_space` : broad area of research\n", + "- `query_string` : a specific question to the documents in the query space\n", + "\n", + "\n", + "To download the open access pdfs and extract text from them, simply mark the `full_text` flag as `True` -\n", + "\n", + "\n", + "```python\n", + "s2reader = SemanticScholarReader()\n", + "documents = s2reader.load_data(query_space, total_papers, full_text=True)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_hub.semanticscholar.base import SemanticScholarReader\n", + "import os\n", + "import openai\n", + "from llama_index.llms import OpenAI\n", + "from llama_index.query_engine import CitationQueryEngine\n", + "from llama_index import (\n", + " VectorStoreIndex,\n", + " StorageContext,\n", + " load_index_from_storage,\n", + " ServiceContext,\n", + ")\n", + "from llama_index.response.notebook_utils import display_response\n", + "\n", + "# initialize the SemanticScholarReader\n", + "s2reader = SemanticScholarReader()\n", + "\n", + "# initialize the service context\n", + "openai.api_key = os.environ[\"OPENAI_API_KEY\"]\n", + "service_context = ServiceContext.from_defaults(\n", + " llm=OpenAI(model=\"gpt-3.5-turbo\", temperature=0)\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "**`Final Response:`** Large language models have limitations in terms of their training cost and computational resources [1]. While they can be efficient once trained, generating content from a trained model can still consume significant resources [1]. Techniques like model distillation can help reduce the cost of these models [1]. Additionally, increasing the size of language models may not necessarily improve their performance on long-tail knowledge or rare instances [3]. Scaling up models alone may not be sufficient to achieve high accuracy on specific types of questions [3]. There is also a need to modify the training objective or increase the number of training epochs to encourage memorization and focus on salient facts [4]. It is important to be cautious in how we talk about large language models, avoiding anthropomorphism and recognizing their limitations [5]." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 1/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** ce86c15c-97b2-462b-97a8-01d4f9b5cdca
**Similarity:** 0.8679221353278955
**Text:** ...
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 2/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 928d36ca-bf21-47f1-820b-b57a7fa30354
**Similarity:** 0.8679221353278955
**Text:** ...
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 3/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 6d0eba26-64a5-4b84-a71a-1bc412323761
**Similarity:** 0.864251829100195
**Text:** ...
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 4/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 509f3675-6048-4a14-8ba2-874316078ebf
**Similarity:** 0.864251829100195
**Text:** ...
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 5/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 8de0d9da-9729-486f-8490-434caf207934
**Similarity:** 0.8627260872607259
**Text:** ...
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 6/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 45b8d27e-c739-4493-acec-9554ea1ed24c
**Similarity:** 0.8627260872607259
**Text:** ...
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "query_space = \"large language models\"\n", + "query_string = \"limitations of using large language models\"\n", + "full_text = True\n", + "# be careful with the total_papers when full_text = True\n", + "# it can take a long time to download\n", + "total_papers = 50\n", + "\n", + "persist_dir = (\n", + " \"./citation_\" + query_space + \"_\" + str(total_papers) + \"_\" + str(full_text)\n", + ")\n", + "\n", + "if not os.path.exists(persist_dir):\n", + " # Load data from Semantic Scholar\n", + " documents = s2reader.load_data(query_space, total_papers, full_text=full_text)\n", + " index = VectorStoreIndex.from_documents(documents, service_context=service_context)\n", + " index.storage_context.persist(persist_dir=persist_dir)\n", + "else:\n", + " index = load_index_from_storage(\n", + " StorageContext.from_defaults(persist_dir=persist_dir),\n", + " service_context=service_context,\n", + " )\n", + " \n", + "# initialize the citation query engine\n", + "query_engine = CitationQueryEngine.from_args(\n", + " index,\n", + " similarity_top_k=3,\n", + " citation_chunk_size=512,\n", + ")\n", + "\n", + "# query the citation query engine\n", + "response = query_engine.query(query_string)\n", + "display_response(response, show_source=True, source_length=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "**`Final Response:`** The efficacy numbers of the COVID-19 vaccines are as follows:\n", + "\n", + "- NVX-CoV2373: 49% efficacy against the B.1.351 variant, increasing to 60% when excluding HIV-positive individuals [1].\n", + "- Ad26.COV2-S: 72% efficacy against PCR-confirmed infection in the USA, reduced to 66% efficacy in Latin America and 57% efficacy in South Africa [1].\n", + "- AZD1222: Did not demonstrate protection against mild to moderate B.1.351-induced COVID-19 [1].\n", + "- BNT162b2: Elicited antibodies with neutralizing activity against B.1.1.7 and P.1 variants [1].\n", + "- CoronaVac: 50% efficacy against symptomatic infection [1].\n", + "- Sinopharm (BBIBP-CorV): 78% efficacy against COVID-19 and 79% efficacy against hospitalization [5].\n", + "- Novavax (NVX-CoV2373): 89% efficacy against symptomatic COVID-19 and positive RT-PCR test result [5].\n", + "- VECTOR (EpiVacCorona): No data available [5].\n", + "\n", + "Note: These efficacy numbers are based on the provided sources and may not represent the most up-to-date information." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 1/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** b6663a8b-5679-4723-9cc9-925ce5b84c34
**Similarity:** 0.8624234672546093
**Text:** Source 1:\n", + "NVX-CoV2373 \n", + "showed an efficacy of 49% against the \n", + "B.1.351 variant in the prevention o...
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 2/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 67d4659d-b164-4841-9176-63e49f837b9c
**Similarity:** 0.8624234672546093
**Text:** Source 2:\n", + "617.2) variant.A significant \n", + "decrease in neutralizing antibody titre \n", + "has been seen fo...
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 3/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 98ff1d6f-1992-4674-b396-12f58acb5cd7
**Similarity:** 0.8616244247348551
**Text:** Source 3:\n", + "The only valid way to compare vaccines directly is \n", + "in head-to-head efficacy trials, wh...
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 4/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 6752826d-3c31-48de-ab75-c625c06718c1
**Similarity:** 0.8616244247348551
**Text:** Source 4:\n", + "population studied and prevalence \n", + "of SARS-CoV-2 variants at the time of the \n", + "trial, it...
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 5/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 0283e2f1-1eb7-4d73-ba92-1724487b5aee
**Similarity:** 0.8593642969779912
**Text:** Source 5:\n", + "Although differences in how the \n", + "clinical trials were set up make comparison \n", + "between v...
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 6/6`**" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Node ID:** 57356c41-b2bf-429b-a842-bc9962a5f007
**Similarity:** 0.8593642969779912
**Text:** Source 6:\n", + "laboratory \n", + "confirmed COVID-19 \n", + "within \n", + "6 months after first dose≥18 years old 9 months...
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "query_space = \"covid 19 vaccine\"\n", + "query_string = \"List the efficacy numbers of the covid 19 vaccines\"\n", + "full_text = True\n", + "# be careful with the total_papers when full_text = True\n", + "# it can take a long time to download\n", + "total_papers = 50\n", + "\n", + "persist_dir = (\n", + " \"./citation_\" + query_space + \"_\" + str(total_papers) + \"_\" + str(full_text)\n", + ")\n", + "\n", + "if not os.path.exists(persist_dir):\n", + " # Load data from Semantic Scholar\n", + " documents = s2reader.load_data(query_space, total_papers, full_text=full_text)\n", + " index = VectorStoreIndex.from_documents(documents, service_context=service_context)\n", + " index.storage_context.persist(persist_dir=persist_dir)\n", + "else:\n", + " index = load_index_from_storage(\n", + " StorageContext.from_defaults(persist_dir=persist_dir),\n", + " service_context=service_context,\n", + " )\n", + " \n", + "# initialize the citation query engine\n", + "query_engine = CitationQueryEngine.from_args(\n", + " index,\n", + " similarity_top_k=3,\n", + " citation_chunk_size=512,\n", + ")\n", + "\n", + "# query the citation query engine\n", + "response = query_engine.query(query_string)\n", + "display_response(response, show_source=True, source_length=100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 857a8230b97a060a25801f94102c2bb6f4df2962 Mon Sep 17 00:00:00 2001 From: shauryr Date: Thu, 24 Aug 2023 20:37:50 -0500 Subject: [PATCH 6/8] fixes notebook --- llama_hub/semanticscholar/demo_s2.ipynb | 83 ++++++++++++------------- 1 file changed, 39 insertions(+), 44 deletions(-) diff --git a/llama_hub/semanticscholar/demo_s2.ipynb b/llama_hub/semanticscholar/demo_s2.ipynb index 74d0b4ba3b..1bea569e50 100644 --- a/llama_hub/semanticscholar/demo_s2.ipynb +++ b/llama_hub/semanticscholar/demo_s2.ipynb @@ -4,26 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Demo: Semantic Scholar Loader in llama-index" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Some preliminaries -\n", - "\n", - "- `query_space` : broad area of research\n", - "- `query_string` : a specific question to the documents in the query space\n", - "\n", - "\n", - "To download the open access pdfs and extract text from them, simply mark the `full_text` flag as `True` -\n", - "\n", - "\n", - "```python\n", - "s2reader = SemanticScholarReader()\n", - "documents = s2reader.load_data(query_space, total_papers, full_text=True)\n", - "```" + "# Semantic Scholar Loader in llama-index" ] }, { @@ -57,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -99,7 +80,8 @@ { "data": { "text/markdown": [ - "**Node ID:** ce86c15c-97b2-462b-97a8-01d4f9b5cdca
**Similarity:** 0.8679221353278955
**Text:** ...
" + "**Node ID:** 35028af6-85ea-4f55-a5b4-bfe11778cbb2
**Similarity:** 0.8679221353278955
**Text:** Source 1:\n", + "consume significant resources during training, they can be surprisingly efficient once tr...
**Metadata:** {'title': 'Language Models are Few-Shot Learners', 'venue': 'Neural Information Processing Systems', 'year': 2020, 'paperId': '6b85b63579a916f705a8e10a49bd8d849d91b1fc', 'citationCount': 14032, 'openAccessPdf': None, 'authors': ['Tom B. Brown', 'Benjamin Mann', 'Nick Ryder', 'Melanie Subbiah', 'J. Kaplan', 'Prafulla Dhariwal', 'Arvind Neelakantan', 'Pranav Shyam', 'Girish Sastry', 'Amanda Askell', 'Sandhini Agarwal', 'Ariel Herbert-Voss', 'Gretchen Krueger', 'T. Henighan', 'Rewon Child', 'A. Ramesh', 'Daniel M. Ziegler', 'Jeff Wu', 'Clemens Winter', 'Christopher Hesse', 'Mark Chen', 'Eric Sigler', 'Mateusz Litwin', 'Scott Gray', 'Benjamin Chess', 'Jack Clark', 'Christopher Berner', 'Sam McCandlish', 'Alec Radford', 'Ilya Sutskever', 'Dario Amodei'], 'externalIds': {'DBLP': 'journals/corr/abs-2005-14165', 'ArXiv': '2005.14165', 'MAG': '3030163527', 'CorpusId': 218971783}}
" ], "text/plain": [ "" @@ -135,7 +117,9 @@ { "data": { "text/markdown": [ - "**Node ID:** 928d36ca-bf21-47f1-820b-b57a7fa30354
**Similarity:** 0.8679221353278955
**Text:** ...
" + "**Node ID:** e5f116b6-5fe9-4020-90c6-59ad6d28d0f1
**Similarity:** 0.8679221353278955
**Text:** Source 2:\n", + "Our work focuses on the first approach (scaling compute and parameters together,\n", + "by stra...
**Metadata:** {'title': 'Language Models are Few-Shot Learners', 'venue': 'Neural Information Processing Systems', 'year': 2020, 'paperId': '6b85b63579a916f705a8e10a49bd8d849d91b1fc', 'citationCount': 14032, 'openAccessPdf': None, 'authors': ['Tom B. Brown', 'Benjamin Mann', 'Nick Ryder', 'Melanie Subbiah', 'J. Kaplan', 'Prafulla Dhariwal', 'Arvind Neelakantan', 'Pranav Shyam', 'Girish Sastry', 'Amanda Askell', 'Sandhini Agarwal', 'Ariel Herbert-Voss', 'Gretchen Krueger', 'T. Henighan', 'Rewon Child', 'A. Ramesh', 'Daniel M. Ziegler', 'Jeff Wu', 'Clemens Winter', 'Christopher Hesse', 'Mark Chen', 'Eric Sigler', 'Mateusz Litwin', 'Scott Gray', 'Benjamin Chess', 'Jack Clark', 'Christopher Berner', 'Sam McCandlish', 'Alec Radford', 'Ilya Sutskever', 'Dario Amodei'], 'externalIds': {'DBLP': 'journals/corr/abs-2005-14165', 'ArXiv': '2005.14165', 'MAG': '3030163527', 'CorpusId': 218971783}}
" ], "text/plain": [ "" @@ -171,7 +155,10 @@ { "data": { "text/markdown": [ - "**Node ID:** 6d0eba26-64a5-4b84-a71a-1bc412323761
**Similarity:** 0.864251829100195
**Text:** ...
" + "**Node ID:** 3bd5a072-739a-4d4d-bd2d-30b8a50e26f1
**Similarity:** 0.864251829100195
**Text:** Source 3:\n", + "small accu-\n", + "racy gains.An alternative idea would be to increase the\n", + "diversity of the pr...
**Metadata:** {'title': 'Large Language Models Struggle to Learn Long-Tail Knowledge', 'venue': 'International Conference on Machine Learning', 'year': 2022, 'paperId': '75f7e9e2b59fb640ef9d1dff94097175daf46c4d', 'citationCount': 35, 'openAccessPdf': None, 'authors': ['Nikhil Kandpal', 'H. Deng', 'Adam Roberts', 'Eric Wallace', 'Colin Raffel'], 'externalIds': {'DBLP': 'journals/corr/abs-2211-08411', 'ArXiv': '2211.08411', 'DOI': '10.48550/arXiv.2211.08411', 'CorpusId': 253522998}}
" ], "text/plain": [ "" @@ -207,7 +194,9 @@ { "data": { "text/markdown": [ - "**Node ID:** 509f3675-6048-4a14-8ba2-874316078ebf
**Similarity:** 0.864251829100195
**Text:** ...
" + "**Node ID:** 8ee81016-da93-4d9a-b038-f47020260f85
**Similarity:** 0.864251829100195
**Text:** Source 4:\n", + "All of the LMs that we study do limited epochs,\n", + "as it is generally seen as preferable t...
**Metadata:** {'title': 'Large Language Models Struggle to Learn Long-Tail Knowledge', 'venue': 'International Conference on Machine Learning', 'year': 2022, 'paperId': '75f7e9e2b59fb640ef9d1dff94097175daf46c4d', 'citationCount': 35, 'openAccessPdf': None, 'authors': ['Nikhil Kandpal', 'H. Deng', 'Adam Roberts', 'Eric Wallace', 'Colin Raffel'], 'externalIds': {'DBLP': 'journals/corr/abs-2211-08411', 'ArXiv': '2211.08411', 'DOI': '10.48550/arXiv.2211.08411', 'CorpusId': 253522998}}
" ], "text/plain": [ "" @@ -243,7 +232,10 @@ { "data": { "text/markdown": [ - "**Node ID:** 8de0d9da-9729-486f-8490-434caf207934
**Similarity:** 0.8627260872607259
**Text:** ...
" + "**Node ID:** 17bbc1ce-f975-4ef2-a0c5-9110ade5225f
**Similarity:** 0.8627260872607259
**Text:** Source 5:\n", + "Well, this is fine\n", + "as long as there is no possibility of anyone as-\n", + "signing more weight ...
**Metadata:** {'title': 'Talking About Large Language Models', 'venue': 'arXiv.org', 'year': 2022, 'paperId': '3eed4de25636ac90f39f6e1ef70e3507ed61a2a6', 'citationCount': 43, 'openAccessPdf': None, 'authors': ['M. Shanahan'], 'externalIds': {'ArXiv': '2212.03551', 'DBLP': 'journals/corr/abs-2212-03551', 'DOI': '10.48550/arXiv.2212.03551', 'CorpusId': 254366666}}
" ], "text/plain": [ "" @@ -279,7 +271,10 @@ { "data": { "text/markdown": [ - "**Node ID:** 45b8d27e-c739-4493-acec-9554ea1ed24c
**Similarity:** 0.8627260872607259
**Text:** ...
" + "**Node ID:** 48150b03-0525-498f-8601-c4698954a7db
**Similarity:** 0.8627260872607259
**Text:** Source 6:\n", + "Acknowledgments\n", + "Thanks to Toni Creswell, Richard Evans, Chris-\n", + "tos Kaplanis, Andrew Lam...
**Metadata:** {'title': 'Talking About Large Language Models', 'venue': 'arXiv.org', 'year': 2022, 'paperId': '3eed4de25636ac90f39f6e1ef70e3507ed61a2a6', 'citationCount': 43, 'openAccessPdf': None, 'authors': ['M. Shanahan'], 'externalIds': {'ArXiv': '2212.03551', 'DBLP': 'journals/corr/abs-2212-03551', 'DOI': '10.48550/arXiv.2212.03551', 'CorpusId': 254366666}}
" ], "text/plain": [ "" @@ -321,12 +316,12 @@ "\n", "# query the citation query engine\n", "response = query_engine.query(query_string)\n", - "display_response(response, show_source=True, source_length=3)" + "display_response(response, show_source=True, source_length=100, show_source_metadata=True)" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -339,8 +334,8 @@ "- AZD1222: Did not demonstrate protection against mild to moderate B.1.351-induced COVID-19 [1].\n", "- BNT162b2: Elicited antibodies with neutralizing activity against B.1.1.7 and P.1 variants [1].\n", "- CoronaVac: 50% efficacy against symptomatic infection [1].\n", - "- Sinopharm (BBIBP-CorV): 78% efficacy against COVID-19 and 79% efficacy against hospitalization [5].\n", - "- Novavax (NVX-CoV2373): 89% efficacy against symptomatic COVID-19 and positive RT-PCR test result [5].\n", + "- Sinopharm (BBIBP-CorV): 78% efficacy against COVID-19 [5].\n", + "- Novavax (NVX-CoV2373): 89% efficacy against symptomatic COVID-19 [5].\n", "- VECTOR (EpiVacCorona): No data available [5].\n", "\n", "Note: These efficacy numbers are based on the provided sources and may not represent the most up-to-date information." @@ -379,10 +374,10 @@ { "data": { "text/markdown": [ - "**Node ID:** b6663a8b-5679-4723-9cc9-925ce5b84c34
**Similarity:** 0.8624234672546093
**Text:** Source 1:\n", + "**Node ID:** 4e5f66ec-7455-436c-ad7b-db666bfa0f1f
**Similarity:** 0.8624234672546093
**Text:** Source 1:\n", "NVX-CoV2373 \n", "showed an efficacy of 49% against the \n", - "B.1.351 variant in the prevention o...
" + "B.1.351 variant in the prevention o...
**Metadata:** {'title': 'Progress of the COVID-19 vaccine effort: viruses, vaccines and variants versus efficacy, effectiveness and escape', 'venue': 'Nature reviews. Immunology', 'year': 2021, 'paperId': 'b8b7b90263b1168d9466feb99ce2ce1efa7514b3', 'citationCount': 603, 'openAccessPdf': 'https://www.nature.com/articles/s41577-021-00592-1.pdf', 'authors': ['J. Tregoning', 'Katie E. Flight', 'Sophie L. Higham', 'Ziyin Wang', 'B. F. Pierce'], 'externalIds': {'PubMedCentral': '8351583', 'DOI': '10.1038/s41577-021-00592-1', 'CorpusId': 236968006, 'PubMed': '34373623'}}
" ], "text/plain": [ "" @@ -418,10 +413,10 @@ { "data": { "text/markdown": [ - "**Node ID:** 67d4659d-b164-4841-9176-63e49f837b9c
**Similarity:** 0.8624234672546093
**Text:** Source 2:\n", + "**Node ID:** f0144cc7-f35b-49c7-aca4-030bed69ee1e
**Similarity:** 0.8624234672546093
**Text:** Source 2:\n", "617.2) variant.A significant \n", "decrease in neutralizing antibody titre \n", - "has been seen fo...
" + "has been seen fo...
**Metadata:** {'title': 'Progress of the COVID-19 vaccine effort: viruses, vaccines and variants versus efficacy, effectiveness and escape', 'venue': 'Nature reviews. Immunology', 'year': 2021, 'paperId': 'b8b7b90263b1168d9466feb99ce2ce1efa7514b3', 'citationCount': 603, 'openAccessPdf': 'https://www.nature.com/articles/s41577-021-00592-1.pdf', 'authors': ['J. Tregoning', 'Katie E. Flight', 'Sophie L. Higham', 'Ziyin Wang', 'B. F. Pierce'], 'externalIds': {'PubMedCentral': '8351583', 'DOI': '10.1038/s41577-021-00592-1', 'CorpusId': 236968006, 'PubMed': '34373623'}}
" ], "text/plain": [ "" @@ -457,9 +452,9 @@ { "data": { "text/markdown": [ - "**Node ID:** 98ff1d6f-1992-4674-b396-12f58acb5cd7
**Similarity:** 0.8616244247348551
**Text:** Source 3:\n", + "**Node ID:** ddceeaeb-a91f-403b-a4ed-8f7cc8307ccb
**Similarity:** 0.8616244247348551
**Text:** Source 3:\n", "The only valid way to compare vaccines directly is \n", - "in head-to-head efficacy trials, wh...
" + "in head-to-head efficacy trials, wh...
**Metadata:** {'title': 'Progress of the COVID-19 vaccine effort: viruses, vaccines and variants versus efficacy, effectiveness and escape', 'venue': 'Nature reviews. Immunology', 'year': 2021, 'paperId': 'b8b7b90263b1168d9466feb99ce2ce1efa7514b3', 'citationCount': 603, 'openAccessPdf': 'https://www.nature.com/articles/s41577-021-00592-1.pdf', 'authors': ['J. Tregoning', 'Katie E. Flight', 'Sophie L. Higham', 'Ziyin Wang', 'B. F. Pierce'], 'externalIds': {'PubMedCentral': '8351583', 'DOI': '10.1038/s41577-021-00592-1', 'CorpusId': 236968006, 'PubMed': '34373623'}}
" ], "text/plain": [ "" @@ -495,10 +490,10 @@ { "data": { "text/markdown": [ - "**Node ID:** 6752826d-3c31-48de-ab75-c625c06718c1
**Similarity:** 0.8616244247348551
**Text:** Source 4:\n", + "**Node ID:** 0976ea2c-4d05-4201-8ca1-a585053b6756
**Similarity:** 0.8616244247348551
**Text:** Source 4:\n", "population studied and prevalence \n", "of SARS-CoV-2 variants at the time of the \n", - "trial, it...
" + "trial, it...
**Metadata:** {'title': 'Progress of the COVID-19 vaccine effort: viruses, vaccines and variants versus efficacy, effectiveness and escape', 'venue': 'Nature reviews. Immunology', 'year': 2021, 'paperId': 'b8b7b90263b1168d9466feb99ce2ce1efa7514b3', 'citationCount': 603, 'openAccessPdf': 'https://www.nature.com/articles/s41577-021-00592-1.pdf', 'authors': ['J. Tregoning', 'Katie E. Flight', 'Sophie L. Higham', 'Ziyin Wang', 'B. F. Pierce'], 'externalIds': {'PubMedCentral': '8351583', 'DOI': '10.1038/s41577-021-00592-1', 'CorpusId': 236968006, 'PubMed': '34373623'}}
" ], "text/plain": [ "" @@ -534,10 +529,10 @@ { "data": { "text/markdown": [ - "**Node ID:** 0283e2f1-1eb7-4d73-ba92-1724487b5aee
**Similarity:** 0.8593642969779912
**Text:** Source 5:\n", + "**Node ID:** 21168cc5-8587-4a61-9c0c-4ebad48e2653
**Similarity:** 0.8593642969779912
**Text:** Source 5:\n", "Although differences in how the \n", "clinical trials were set up make comparison \n", - "between v...
" + "between v...
**Metadata:** {'title': 'Progress of the COVID-19 vaccine effort: viruses, vaccines and variants versus efficacy, effectiveness and escape', 'venue': 'Nature reviews. Immunology', 'year': 2021, 'paperId': 'b8b7b90263b1168d9466feb99ce2ce1efa7514b3', 'citationCount': 603, 'openAccessPdf': 'https://www.nature.com/articles/s41577-021-00592-1.pdf', 'authors': ['J. Tregoning', 'Katie E. Flight', 'Sophie L. Higham', 'Ziyin Wang', 'B. F. Pierce'], 'externalIds': {'PubMedCentral': '8351583', 'DOI': '10.1038/s41577-021-00592-1', 'CorpusId': 236968006, 'PubMed': '34373623'}}
" ], "text/plain": [ "" @@ -573,11 +568,11 @@ { "data": { "text/markdown": [ - "**Node ID:** 57356c41-b2bf-429b-a842-bc9962a5f007
**Similarity:** 0.8593642969779912
**Text:** Source 6:\n", + "**Node ID:** 344121c1-d02a-4149-87c1-4c682d5d7dc1
**Similarity:** 0.8593642969779912
**Text:** Source 6:\n", "laboratory \n", "confirmed COVID-19 \n", "within \n", - "6 months after first dose≥18 years old 9 months...
" + "6 months after first dose≥18 years old 9 months...
**Metadata:** {'title': 'Progress of the COVID-19 vaccine effort: viruses, vaccines and variants versus efficacy, effectiveness and escape', 'venue': 'Nature reviews. Immunology', 'year': 2021, 'paperId': 'b8b7b90263b1168d9466feb99ce2ce1efa7514b3', 'citationCount': 603, 'openAccessPdf': 'https://www.nature.com/articles/s41577-021-00592-1.pdf', 'authors': ['J. Tregoning', 'Katie E. Flight', 'Sophie L. Higham', 'Ziyin Wang', 'B. F. Pierce'], 'externalIds': {'PubMedCentral': '8351583', 'DOI': '10.1038/s41577-021-00592-1', 'CorpusId': 236968006, 'PubMed': '34373623'}}
" ], "text/plain": [ "" @@ -619,7 +614,7 @@ "\n", "# query the citation query engine\n", "response = query_engine.query(query_string)\n", - "display_response(response, show_source=True, source_length=100)" + "display_response(response, show_source=True, source_length=100, show_source_metadata=True)" ] }, { From 64a495de026a5ffcfcd5a7a9230309532fe630b1 Mon Sep 17 00:00:00 2001 From: shauryr Date: Thu, 24 Aug 2023 20:38:01 -0500 Subject: [PATCH 7/8] added full text details --- llama_hub/semanticscholar/README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/llama_hub/semanticscholar/README.md b/llama_hub/semanticscholar/README.md index 3010c578b9..04c804d313 100644 --- a/llama_hub/semanticscholar/README.md +++ b/llama_hub/semanticscholar/README.md @@ -4,6 +4,23 @@ Welcome to Semantic Scholar Loader. This module serves as a crucial utility for For any research topic you are interested in, this loader reads relevant papers from a search result in Semantic Scholar into `Documents`. +Please go through [demo_s2.ipynb](demo_s2.ipynb) + +## Some preliminaries - + +- `query_space` : broad area of research +- `query_string` : a specific question to the documents in the query space + +**UPDATE** : + +To download the open access pdfs and extract text from them, simply mark the `full_text` flag as `True` : + + +```python +s2reader = SemanticScholarReader() +documents = s2reader.load_data(query_space, total_papers, full_text=True) +``` + ## Usage Here is an example of how to use this loader in `llama_index` and get citations for a given query. From 801e498cb045be91dddcc8ae3c0506b45805f2f5 Mon Sep 17 00:00:00 2001 From: shauryr Date: Thu, 24 Aug 2023 21:18:40 -0500 Subject: [PATCH 8/8] fixes --- llama_hub/semanticscholar/demo_s2.ipynb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_hub/semanticscholar/demo_s2.ipynb b/llama_hub/semanticscholar/demo_s2.ipynb index 1bea569e50..31933ab42b 100644 --- a/llama_hub/semanticscholar/demo_s2.ipynb +++ b/llama_hub/semanticscholar/demo_s2.ipynb @@ -286,7 +286,6 @@ ], "source": [ "query_space = \"large language models\"\n", - "query_string = \"limitations of using large language models\"\n", "full_text = True\n", "# be careful with the total_papers when full_text = True\n", "# it can take a long time to download\n", @@ -314,6 +313,8 @@ " citation_chunk_size=512,\n", ")\n", "\n", + "query_string = \"limitations of using large language models\"\n", + "\n", "# query the citation query engine\n", "response = query_engine.query(query_string)\n", "display_response(response, show_source=True, source_length=100, show_source_metadata=True)"