Skip to content

Commit

Permalink
feat(community): update embedding jina (#7292)
Browse files Browse the repository at this point in the history
Co-authored-by: Jacob Lee <[email protected]>
  • Loading branch information
axuj and jacoblee93 authored Dec 10, 2024
1 parent 77336ec commit 0c79483
Show file tree
Hide file tree
Showing 4 changed files with 197 additions and 128 deletions.
36 changes: 28 additions & 8 deletions docs/core_docs/docs/integrations/text_embedding/jina.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@ Here’s how to create an instance of `JinaEmbeddings`:
import { JinaEmbeddings } from "@langchain/community/embeddings/jina";

const embeddings = new JinaEmbeddings({
apiToken: "YOUR_API_TOKEN",
model: "jina-embeddings-v2-base-en", // Optional, defaults to "jina-embeddings-v2-base-en"
apiKey: "YOUR_API_TOKEN",
model: "jina-clip-v2", // Optional, defaults to "jina-clip-v2"
});
```

If the `apiToken` is not provided, it will be read from the `JINA_API_KEY` environment variable.
If the `apiKey` is not provided, it will be read from the `JINA_API_KEY` environment variable.

## Generating Embeddings

Expand All @@ -59,10 +59,18 @@ console.log(embedding);
To generate embeddings for multiple documents, use the `embedDocuments` method.

```typescript
import { localImageToBase64 } from "@langchain/community/utils/local_image_to_base64";
const documents = [
"Document 1 text...",
"Document 2 text...",
"Document 3 text...",
"hello",
{
text: "hello",
},
{
image: "https://i.ibb.co/nQNGqL0/beach1.jpg",
},
{
image: await localImageToBase64("beach1.jpg"),
},
];

const embeddingsArray = await embeddings.embedDocuments(documents);
Expand All @@ -87,17 +95,29 @@ Here’s a complete example of how to set up and use the `JinaEmbeddings` class:

```typescript
import { JinaEmbeddings } from "@langchain/community/embeddings/jina";
import { localImageToBase64 } from "@langchain/community/embeddings/jina/util";

const embeddings = new JinaEmbeddings({
apiToken: "YOUR_API_TOKEN",
apiKey: "YOUR_API_TOKEN",
model: "jina-embeddings-v2-base-en",
});

async function runExample() {
const queryEmbedding = await embeddings.embedQuery("Example query text.");
console.log("Query Embedding:", queryEmbedding);

const documents = ["Text 1", "Text 2", "Text 3"];
const documents = [
"hello",
{
text: "hello",
},
{
image: "https://i.ibb.co/nQNGqL0/beach1.jpg",
},
{
image: await localImageToBase64("beach1.jpg"),
},
];
const documentEmbeddings = await embeddings.embedDocuments(documents);
console.log("Document Embeddings:", documentEmbeddings);
}
Expand Down
276 changes: 159 additions & 117 deletions libs/langchain-community/src/embeddings/jina.ts
Original file line number Diff line number Diff line change
@@ -1,162 +1,204 @@
import { existsSync, readFileSync } from "fs";
import { parse } from "url";
import { Embeddings, EmbeddingsParams } from "@langchain/core/embeddings";
import { Embeddings, type EmbeddingsParams } from "@langchain/core/embeddings";
import { chunkArray } from "@langchain/core/utils/chunk_array";
import { getEnvironmentVariable } from "@langchain/core/utils/env";

/**
* The default Jina API URL for embedding requests.
*/
const JINA_API_URL = "https://api.jina.ai/v1/embeddings";

/**
 * Check whether a URL or path refers to a file that exists on the local disk.
 *
 * Inputs with no protocol are treated as filesystem paths, and `file:` URLs
 * are resolved to their path component. Any other protocol is never local.
 *
 * @param url - The URL or filesystem path to check.
 * @returns True if the input resolves to an existing local file, false otherwise.
 */
function isLocal(url: string): boolean {
  const { protocol, pathname } = parse(url);
  const parsedPath = pathname || "";

  if (protocol === null) {
    // `parse` percent-escapes characters such as spaces in `pathname`, so a
    // plain path like "my image.png" must be checked verbatim. The parsed
    // pathname is kept as a fallback to preserve the previous behaviour
    // (e.g. inputs carrying a trailing "?query" or "#hash").
    return existsSync(url) || existsSync(parsedPath);
  }

  if (protocol === "file:") {
    // The path of a file URL is percent-encoded; decode it before touching
    // the filesystem.
    let decodedPath = parsedPath;
    try {
      decodedPath = decodeURIComponent(parsedPath);
    } catch {
      // Malformed escape sequence (e.g. a literal "%"): fall back to the raw path.
    }
    return existsSync(decodedPath) || existsSync(parsedPath);
  }

  return false;
}
export interface JinaEmbeddingsParams extends EmbeddingsParams {
/** Model name to use */
model:
| "jina-clip-v2"
| "jina-embeddings-v3"
| "jina-colbert-v2"
| "jina-clip-v1"
| "jina-colbert-v1-en"
| "jina-embeddings-v2-base-es"
| "jina-embeddings-v2-base-code"
| "jina-embeddings-v2-base-de"
| "jina-embeddings-v2-base-zh"
| "jina-embeddings-v2-base-en"
| string;

baseUrl?: string;

/**
* Get the bytes string of a file.
* @param filePath - The path to the file.
* @returns The bytes string of the file.
*/
function getBytesStr(filePath: string): string {
const imageFile = readFileSync(filePath);
return Buffer.from(imageFile).toString("base64");
}
/**
* Timeout to use when making requests to Jina.
*/
timeout?: number;

/**
* Input parameters for the Jina embeddings
*/
export interface JinaEmbeddingsParams extends EmbeddingsParams {
/**
* The API key to use for authentication.
* If not provided, it will be read from the `JINA_API_KEY` environment variable.
* The maximum number of documents to embed in a single request.
*/
apiKey?: string;
batchSize?: number;

/**
* The model ID to use for generating embeddings.
* Default: `jina-embeddings-v2-base-en`
* Whether to strip new lines from the input text.
*/
model?: string;
}
stripNewLines?: boolean;

/**
* Response from the Jina embeddings API.
*/
export interface JinaEmbeddingsResponse {
/**
* The embeddings generated for the input texts.
* The dimensions of the embedding.
*/
data: { index: number; embedding: number[] }[];
dimensions?: number;

/**
* The detail of the response e.g usage, model used etc.
* Scales the embedding so its Euclidean (L2) norm becomes 1, preserving direction. Useful when downstream tasks involve dot-product similarity, classification, or visualization.
*/
detail?: string;
normalized?: boolean;
}

/**
* A class for generating embeddings using the Jina API.
* @example
* ```typescript
* // Embed a query using the JinaEmbeddings class
* const model = new JinaEmbeddings();
* const res = await model.embedQuery(
* "What would be a good name for a semantic search engine ?",
* );
* console.log({ res });
* ```
*/
export class JinaEmbeddings extends Embeddings implements JinaEmbeddingsParams {
apiKey: string;
/**
 * A single multimodal input object carrying either `text` or `image`.
 * The opposite key is typed as `never` in each variant, so an object that
 * sets both keys is rejected by the type checker.
 */
type JinaMultiModelInput =
| {
// Text to embed.
text: string;
image?: never;
}
| {
// Image to embed, given as a string.
// NOTE(review): the docs example passes a URL or the result of
// `localImageToBase64` here — confirm the accepted formats with the Jina API.
image: string;
text?: never;
};

model: string;
/** One item to embed: either a bare string or a multimodal `{ text }` / `{ image }` object. */
export type JinaEmbeddingsInput = string | JinaMultiModelInput;

interface EmbeddingCreateParams {
model: JinaEmbeddingsParams["model"];

/**
* Constructor for the JinaEmbeddings class.
* @param fields - An optional object with properties to configure the instance.
* Input can be plain strings or JinaMultiModelInput objects. To embed an image, pass a JinaMultiModelInput with its `image` field set.
*/
constructor(fields?: Partial<JinaEmbeddingsParams> & { verbose?: boolean }) {
const fieldsWithDefaults = {
model: "jina-embeddings-v2-base-en",
...fields,
};
input: JinaEmbeddingsInput[];
dimensions: number;
task: "retrieval.query" | "retrieval.passage";
normalized?: boolean;
}

/**
 * Shape this file expects for a successful embeddings response body.
 * Only `data[].embedding` is read by the request helper in this file;
 * the remaining fields are declared but not used here.
 */
interface EmbeddingResponse {
model: string;
object: string;
// Token accounting reported by the API (not read in this file).
usage: {
total_tokens: number;
prompt_tokens: number;
};
// One entry per embedded input.
data: {
object: string;
// NOTE(review): presumably the position of the corresponding input in the request — confirm against the API reference.
index: number;
embedding: number[];
}[];
}

/**
 * Shape this file expects for an error response body.
 * A truthy `detail` is used as the message of the thrown `Error`.
 */
interface EmbeddingErrorResponse {
detail: string;
}

export class JinaEmbeddings extends Embeddings implements JinaEmbeddingsParams {
model: JinaEmbeddingsParams["model"] = "jina-clip-v2";

batchSize = 24;

baseUrl = "https://api.jina.ai/v1/embeddings";

stripNewLines = true;

dimensions = 1024;

apiKey: string;

normalized = true;

constructor(
fields?: Partial<JinaEmbeddingsParams> & {
apiKey?: string;
}
) {
const fieldsWithDefaults = { maxConcurrency: 2, ...fields };
super(fieldsWithDefaults);

const apiKey =
fieldsWithDefaults?.apiKey ||
getEnvironmentVariable("JINA_API_KEY") ||
getEnvironmentVariable("JINA_AUTH_TOKEN");

if (!apiKey) {
throw new Error("Jina API key not found");
}
if (!apiKey) throw new Error("Jina API key not found");

this.model = fieldsWithDefaults?.model ?? this.model;
this.apiKey = apiKey;

this.model = fieldsWithDefaults?.model ?? this.model;
this.dimensions = fieldsWithDefaults?.dimensions ?? this.dimensions;
this.batchSize = fieldsWithDefaults?.batchSize ?? this.batchSize;
this.stripNewLines =
fieldsWithDefaults?.stripNewLines ?? this.stripNewLines;
this.normalized = fieldsWithDefaults?.normalized ?? this.normalized;
}

/**
* Generates embeddings for an array of inputs.
* @param input - An array of strings or objects to generate embeddings for.
* @returns A Promise that resolves to an array of embeddings.
*/
// eslint-disable-next-line @typescript-eslint/no-explicit-any
private async _embed(input: any): Promise<number[][]> {
const response = await fetch(JINA_API_URL, {
method: "POST",
headers: {
Authorization: `Bearer ${this.apiKey}`,
"Content-Type": "application/json",
},
body: JSON.stringify({ input, model: this.model }),
/**
 * Replaces every newline character with a single space in textual inputs
 * when `stripNewLines` is enabled; otherwise returns the input untouched.
 *
 * Plain strings and objects with a non-empty `text` are rewritten; every
 * other entry (for example image inputs) is passed through as-is.
 *
 * @param input - The inputs about to be sent for embedding.
 * @returns The inputs with newlines replaced, or the original array when stripping is disabled.
 */
private doStripNewLines(input: JinaEmbeddingsInput[]) {
  if (!this.stripNewLines) {
    return input;
  }

  return input.map((entry) => {
    if (typeof entry === "string") {
      return entry.replace(/\n/g, " ");
    }
    if (entry.text) {
      return { text: entry.text.replace(/\n/g, " ") };
    }
    return entry;
  });
}

async embedDocuments(input: JinaEmbeddingsInput[]): Promise<number[][]> {
const batches = chunkArray(this.doStripNewLines(input), this.batchSize);
const batchRequests = batches.map((batch) => {
const params = this.getParams(batch);
return this.embeddingWithRetry(params);
});

const json = (await response.json()) as JinaEmbeddingsResponse;
const batchResponses = await Promise.all(batchRequests);
const embeddings: number[][] = [];

if (!json.data) {
throw new Error(json.detail || "Unknown error from Jina API");
for (let i = 0; i < batchResponses.length; i += 1) {
const batch = batches[i];
const batchResponse = batchResponses[i] || [];
for (let j = 0; j < batch.length; j += 1) {
embeddings.push(batchResponse[j]);
}
}

const sortedEmbeddings = json.data.sort((a, b) => a.index - b.index);

return sortedEmbeddings.map((item) => item.embedding);
return embeddings;
}

/**
* Generates embeddings for an array of texts.
* @param texts - An array of strings to generate embeddings for.
* @returns A Promise that resolves to an array of embeddings.
*/
async embedDocuments(texts: string[]): Promise<number[][]> {
return this._embed(texts);
}
async embedQuery(input: JinaEmbeddingsInput): Promise<number[]> {
const params = this.getParams(this.doStripNewLines([input]), true);

/**
* Generates an embedding for a single text.
* @param text - A string to generate an embedding for.
* @returns A Promise that resolves to an array of numbers representing the embedding.
*/
async embedQuery(text: string): Promise<number[]> {
const embeddings = await this._embed([text]);
const embeddings = (await this.embeddingWithRetry(params)) || [[]];
return embeddings[0];
}

/**
* Generates embeddings for an array of image URIs.
* @param uris - An array of image URIs to generate embeddings for.
* @returns A Promise that resolves to an array of embeddings.
*/
async embedImages(uris: string[]): Promise<number[][]> {
const input = uris.map((uri) => (isLocal(uri) ? getBytesStr(uri) : uri));
return this._embed(input);
/**
 * Builds the request payload for one embedding call from the instance
 * configuration and the given inputs.
 *
 * @param input - The inputs to embed in this request.
 * @param query - When truthy the task is "retrieval.query"; otherwise "retrieval.passage".
 * @returns The payload to serialize as the request body.
 */
private getParams(
  input: JinaEmbeddingsInput[],
  query?: boolean
): EmbeddingCreateParams {
  const task: EmbeddingCreateParams["task"] = query
    ? "retrieval.query"
    : "retrieval.passage";
  const { model, dimensions, normalized } = this;

  // Key order is kept stable so the serialized body is unchanged.
  return { model, input, dimensions, task, normalized };
}

/**
 * Sends one embedding request to `baseUrl` and returns the embeddings,
 * ordered by the `index` reported for each entry.
 *
 * NOTE(review): despite its name, this method performs a single `fetch`
 * and does not retry on failure.
 *
 * @param body - The request payload, serialized as JSON.
 * @returns One embedding vector per entry in the response `data` array.
 * @throws Error if the body is not valid JSON, if the API reports a
 *   `detail` error, if the HTTP status is not successful, or if the
 *   response has no `data` array.
 */
private async embeddingWithRetry(body: EmbeddingCreateParams) {
  const response = await fetch(this.baseUrl, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${this.apiKey}`,
    },
    body: JSON.stringify(body),
  });

  const statusInfo = `HTTP ${response.status} ${response.statusText}`;

  let embeddingData: EmbeddingResponse | EmbeddingErrorResponse | null;
  try {
    embeddingData = await response.json();
  } catch {
    // Error pages from proxies/gateways are often not JSON; surface the
    // HTTP status instead of an opaque parse error.
    throw new Error(
      `Unexpected response from Jina API (${statusInfo}): body is not valid JSON`
    );
  }

  if (embeddingData === null || typeof embeddingData !== "object") {
    throw new Error(`Unexpected response from Jina API (${statusInfo})`);
  }

  if ("detail" in embeddingData && embeddingData.detail) {
    const { detail } = embeddingData;
    // `detail` is typed as a string, but guard at runtime so a structured
    // error payload is not rendered as "[object Object]".
    throw new Error(
      typeof detail === "string" ? detail : JSON.stringify(detail)
    );
  }

  if (
    !response.ok ||
    !("data" in embeddingData) ||
    !Array.isArray(embeddingData.data)
  ) {
    throw new Error(`Unexpected response from Jina API (${statusInfo})`);
  }

  // Sort a copy by `index` so the output order does not depend on the
  // order in which the API happened to list the entries.
  return [...embeddingData.data]
    .sort((a, b) => a.index - b.index)
    .map(({ embedding }) => embedding);
}
}
Loading

0 comments on commit 0c79483

Please sign in to comment.