Skip to content

Commit

Permalink
Format, rename, fix docs
Browse files Browse the repository at this point in the history
  • Loading branch information
jacoblee93 committed Dec 24, 2024
1 parent d169dde commit 58931bf
Show file tree
Hide file tree
Showing 8 changed files with 527 additions and 522 deletions.
20 changes: 12 additions & 8 deletions docs/core_docs/docs/integrations/retrievers/arxiv-retriever.mdx
Original file line number Diff line number Diff line change
@@ -1,40 +1,41 @@
# ArxivRetriever

---

## Overview

The `arXiv Retriever` allows users to query the arXiv database for academic articles. It supports both full-document retrieval (PDF parsing) and summary-based retrieval. For detailed documentation of all ArxivRetriever features and configurations, head to the [API reference](https://arxiv.org/).

## Features

- Query Flexibility: Search using natural language queries or specific arXiv IDs.
- Full-Document Retrieval: Option to fetch and parse PDFs.
- Summaries as Documents: Retrieve summaries for faster results.
- Customizable Options: Configure maximum results and output format.

## Integration details

| Retriever | Source | Package |
| ---------------- | ---------------------------- | --------------------------------------- |
| Retriever | Source | Package |
| ---------------- | ---------------------------- | ---------------------------------------------------------------------------- |
| `ArxivRetriever` | Academic articles from arXiv | [`@langchain/community`](https://www.npmjs.com/package/@langchain/community) |

## Setup/Installation

Ensure the following dependencies are installed:

- `pdf-parse` for parsing PDFs
- `fast-xml-parser` for parsing XML responses from the arXiv API

```npm2yarn
npm install pdf-parse fast-xml-parser
```
---


## Instantiate the retriever

```typescript
const retriever = new ArxivRetriever({
getFullDocuments: false, // Set to true to fetch full documents (PDFs)
maxSearchResults: 5, // Maximum number of results to retrieve
returnFullDocuments: false, // Set to true to fetch full documents (PDFs)
maxSearchResults: 5, // Maximum number of results to retrieve
});
```

Expand All @@ -46,7 +47,7 @@ Use the `invoke` method to search arXiv for relevant articles. You can use eithe
const query = "quantum computing";

const documents = await retriever.invoke(query);
documents.forEach(doc => {
documents.forEach((doc) => {
console.log("Title:", doc.metadata.title);
console.log("Content:", doc.pageContent); // Parsed PDF content
});
Expand All @@ -59,7 +60,10 @@ Like other retrievers, `ArxivRetriever` can be incorporated into LLM application
```typescript
import { ChatOpenAI } from "@langchain/openai";
import { ChatPromptTemplate } from "@langchain/core/prompts";
import { RunnablePassthrough, RunnableSequence } from "@langchain/core/runnables";
import {
RunnablePassthrough,
RunnableSequence,
} from "@langchain/core/runnables";
import { StringOutputParser } from "@langchain/core/output_parsers";
import type { Document } from "@langchain/core/documents";

Expand Down
18 changes: 8 additions & 10 deletions examples/src/retrievers/arxiv.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { ArxivRetriever } from "../../../libs/langchain-community/src/retrievers/arxiv.js";
import { ArxivRetriever } from "@langchain/community/retrievers/arxiv";

export const run = async () => {
/*
Expand All @@ -7,8 +7,8 @@ export const run = async () => {

const queryId = "1605.08386 2103.03404";
const retrieverById = new ArxivRetriever({
getFullDocuments: true,
maxSearchResults: 5
returnFullDocuments: true,
maxSearchResults: 5,
});
const documentsById = await retrieverById.invoke(queryId);
console.log(documentsById);
Expand Down Expand Up @@ -41,12 +41,10 @@ export const run = async () => {
*/

const queryNat = "What is the ImageBind model?";
const retrieverByNat = new ArxivRetriever(
{
getFullDocuments: false,
maxSearchResults: 2
}
);
const retrieverByNat = new ArxivRetriever({
returnFullDocuments: false,
maxSearchResults: 2,
});
const documentsByQuery = await retrieverByNat.invoke(queryNat);
console.log(documentsByQuery);

Expand All @@ -64,4 +62,4 @@ export const run = async () => {
}
]
*/
};
};
1 change: 1 addition & 0 deletions libs/langchain-community/langchain.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,7 @@ export const config = {
"chat_models/zhipuai",
"retrievers/amazon_kendra",
"retrievers/amazon_knowledge_base",
"retrievers/arxiv",
"retrievers/dria",
"retrievers/metal",
"retrievers/supabase",
Expand Down
1 change: 1 addition & 0 deletions libs/langchain-community/src/load/import_constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ export const optionalImportEntrypoints: string[] = [
"langchain_community/callbacks/handlers/upstash_ratelimit",
"langchain_community/retrievers/amazon_kendra",
"langchain_community/retrievers/amazon_knowledge_base",
"langchain_community/retrievers/arxiv",
"langchain_community/retrievers/dria",
"langchain_community/retrievers/metal",
"langchain_community/retrievers/supabase",
Expand Down
1 change: 0 additions & 1 deletion libs/langchain-community/src/load/import_map.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ export * as chat_models__novita from "../chat_models/novita.js";
export * as chat_models__ollama from "../chat_models/ollama.js";
export * as chat_models__togetherai from "../chat_models/togetherai.js";
export * as chat_models__yandex from "../chat_models/yandex.js";
export * as retrievers__arxiv from "../retrievers/arxiv.js";
export * as retrievers__bm25 from "../retrievers/bm25.js";
export * as retrievers__chaindesk from "../retrievers/chaindesk.js";
export * as retrievers__databerry from "../retrievers/databerry.js";
Expand Down
62 changes: 34 additions & 28 deletions libs/langchain-community/src/retrievers/arxiv.ts
Original file line number Diff line number Diff line change
@@ -1,45 +1,51 @@
import { BaseRetriever, BaseRetrieverInput } from "@langchain/core/retrievers";
import { Document } from "@langchain/core/documents";
import { searchArxiv, loadDocsFromResults, getDocsFromSummaries } from '../utils/arxiv.js';
import {
searchArxiv,
loadDocsFromResults,
getDocsFromSummaries,
} from "../utils/arxiv.js";

export type ArxivRetrieverOptions = {
getFullDocuments?: boolean;
maxSearchResults?: number;
returnFullDocuments?: boolean;
maxSearchResults?: number;
} & BaseRetrieverInput;

/**
* A retriever that searches arXiv for relevant articles based on a query.
* It can retrieve either full documents (PDFs) or just summaries.
*/
export class ArxivRetriever extends BaseRetriever {
static lc_name() {
return "ArxivRetriever";
}
static lc_name() {
return "ArxivRetriever";
}

lc_namespace = ["langchain", "retrievers", "arxiv_retriever"];
lc_namespace = ["langchain", "retrievers", "arxiv_retriever"];

getFullDocuments: boolean;
maxSearchResults: number;
returnFullDocuments = false;

constructor(options: ArxivRetrieverOptions = {}) {
super(options);
this.getFullDocuments = options.getFullDocuments ?? false;
this.maxSearchResults = options.maxSearchResults ?? 10;
}
maxSearchResults = 10;

constructor(options: ArxivRetrieverOptions = {}) {
super(options);
this.returnFullDocuments =
options.returnFullDocuments ?? this.returnFullDocuments;
this.maxSearchResults = options.maxSearchResults ?? this.maxSearchResults;
}

async _getRelevantDocuments(query: string): Promise<Document[]> {
try {
const results = await searchArxiv(query, this.maxSearchResults);

async _getRelevantDocuments(query: string): Promise<Document[]> {
try {
const results = await searchArxiv(query, this.maxSearchResults);

if (this.getFullDocuments) {
// Fetch and parse PDFs to get full documents
return await loadDocsFromResults(results);
} else {
// Use summaries as documents
return getDocsFromSummaries(results);
}
} catch (error) {
throw new Error(`Error retrieving documents from arXiv.`);
}
if (this.returnFullDocuments) {
// Fetch and parse PDFs to get full documents
return await loadDocsFromResults(results);
} else {
// Use summaries as documents
return getDocsFromSummaries(results);
}
} catch (error) {
throw new Error(`Error retrieving documents from arXiv.`);
}
}
}
Loading

0 comments on commit 58931bf

Please sign in to comment.