From 195a6d7f27902c786a49a67b0025e8f1d71e4033 Mon Sep 17 00:00:00 2001 From: Mathieu Giorgino Date: Mon, 2 Dec 2024 21:03:27 +0100 Subject: [PATCH] feat(community): Add jira document loader filtering on creation date --- examples/src/document_loaders/jira.ts | 6 +- .../document_loaders/tests/jira.int.test.ts | 230 +++++++++++++----- .../src/document_loaders/web/jira.ts | 41 +++- 3 files changed, 202 insertions(+), 75 deletions(-) diff --git a/examples/src/document_loaders/jira.ts b/examples/src/document_loaders/jira.ts index 0d412203623a..73d52efdf76f 100644 --- a/examples/src/document_loaders/jira.ts +++ b/examples/src/document_loaders/jira.ts @@ -1,9 +1,9 @@ import { JiraProjectLoader } from "@langchain/community/document_loaders/web/jira"; -const host = process.env.JIRA_HOST; +const host = process.env.JIRA_HOST || 'https://jira.example.com'; const username = process.env.JIRA_USERNAME; const accessToken = process.env.JIRA_ACCESS_TOKEN; -const projectKey = process.env.JIRA_PROJECT_KEY; +const projectKey = process.env.JIRA_PROJECT_KEY || 'PROJ'; if (username && accessToken) { // Created within last 30 days @@ -18,7 +18,7 @@ if (username && accessToken) { }); const documents = await loader.load(); - console.log(documents); + console.log(`Loaded ${documents.length} Jira document(s)`); } else { console.log( "You must provide a username and access token to run this example." diff --git a/libs/langchain-community/src/document_loaders/tests/jira.int.test.ts b/libs/langchain-community/src/document_loaders/tests/jira.int.test.ts index 42ec79c7395a..e01d1d65663b 100644 --- a/libs/langchain-community/src/document_loaders/tests/jira.int.test.ts +++ b/libs/langchain-community/src/document_loaders/tests/jira.int.test.ts @@ -3,82 +3,118 @@ */ import { Document } from "@langchain/core/documents"; import { expect, test } from "@jest/globals"; -import { JiraProjectLoader } from "../web/jira.js"; +import { + JiraIssue, + JiraProjectLoader, + JiraProjectLoaderParams, +} from "../web/jira.js"; describe("JiraProjectLoader Integration Tests", () => { const JIRA_HOST = requireEnvVar("JIRA_HOST"); const JIRA_USERNAME = requireEnvVar("JIRA_USERNAME"); const JIRA_ACCESS_TOKEN = requireEnvVar("JIRA_ACCESS_TOKEN"); const JIRA_PROJECT_KEY = requireEnvVar("JIRA_PROJECT_KEY"); + const jiraConf: JiraProjectLoaderParams = { + host: JIRA_HOST, + projectKey: JIRA_PROJECT_KEY, + username: JIRA_USERNAME, + accessToken: JIRA_ACCESS_TOKEN, + limitPerRequest: 20, + }; - function requireEnvVar(name: string): string { - // eslint-disable-next-line no-process-env - const value = process.env[name]; - if (!value) { - throw new Error(`environment variable "${name}" must be set`); - } - return value; - } + test("should load Jira project issues as documents successfully", async () => { + const docs = await loadJiraDocsUntil((docs) => docs.length > 0); - async function loadJiraDocs({createdAfter = undefined}: {createdAfter?: Date} = {}): Promise { - const loader = new JiraProjectLoader({ - host: JIRA_HOST, - projectKey: JIRA_PROJECT_KEY, - username: JIRA_USERNAME, - accessToken: JIRA_ACCESS_TOKEN, - limitPerRequest: 20, - createdAfter - }); - - return loader.load(); - } - - test("should load Jira project issues successfully", async () => { - const now = new Date(); - let months = 1; + expect(docs).toBeDefined(); + expect(Array.isArray(docs)).toBe(true); - let docs: Document[] = []; - while (docs.length === 0 && months < 120) { - const createdAfter = new Date(now); - createdAfter.setDate(now.getDate() - months * 30); - docs = await loadJiraDocs({createdAfter}); - months *= 1.2; + if (docs.length < 1) { + // Skip test if not enough issues available + return; } + const firstDoc = docs[0]; + + // Check document structure + expect(firstDoc).toHaveProperty("pageContent"); + expect(firstDoc).toHaveProperty("metadata"); + + // Check metadata + expect(firstDoc.metadata).toHaveProperty("id"); + expect(firstDoc.metadata).toHaveProperty("host", JIRA_HOST); + expect(firstDoc.metadata).toHaveProperty("projectKey", JIRA_PROJECT_KEY); + + // Check pageContent contains essential Jira issue information + const content = firstDoc.pageContent; + expect(content).toContain("Issue:"); + expect(content).toContain("Project:"); + expect(content).toContain("Status:"); + expect(content).toContain("Priority:"); + expect(content).toContain("Type:"); + expect(content).toContain("Creator:"); + }); - if (months >= 10) { - docs = await loadJiraDocs({}); + test("should filter issues based on createdAfter date", async () => { + // First load at least 2 issues with different creation dates (ignoring time) + const baseIssues = await loadJiraIssuesUntil(haveTwoDifferentCreationDates); + if (baseIssues.length < 2) { + // Skip test if not enough issues available + return; } - expect(docs).toBeDefined(); - expect(Array.isArray(docs)).toBe(true); + // Create a map from date string without time to list of issues + const dateToIssueMap = new Map(); + baseIssues.forEach((issue) => { + const date = asStringWithoutTime(new Date(issue.fields.created)); + dateToIssueMap.set(date, (dateToIssueMap.get(date) ?? []).concat(issue)); + }); + // Convert map to list of {date, issues} + const issuesGroupedByDate = Array.from( + dateToIssueMap, + ([date, issues]) => ({ date, issues }) + ); + issuesGroupedByDate.sort((a, b) => a.date.localeCompare(b.date)); + + // Pick middle date to split issues in two groups + const middleIndex = Math.floor(issuesGroupedByDate.length / 2); + const middleDate = new Date(issuesGroupedByDate[middleIndex].date); + const issuesAfterMiddle = issuesGroupedByDate + .slice(middleIndex) + .flatMap(({ issues }) => issues); + + // Load issues created after middle date + const loader = new JiraProjectLoader({ + ...jiraConf, + createdAfter: middleDate, + }); - if (docs.length > 0) { - const firstDoc = docs[0]; - - // Check document structure - expect(firstDoc).toHaveProperty("pageContent"); - expect(firstDoc).toHaveProperty("metadata"); - - // Check metadata - expect(firstDoc.metadata).toHaveProperty("id"); - expect(firstDoc.metadata).toHaveProperty("host", JIRA_HOST); - expect(firstDoc.metadata).toHaveProperty("projectKey", JIRA_PROJECT_KEY); - - // Check pageContent contains essential Jira issue information - const content = firstDoc.pageContent; - expect(content).toContain("Issue:"); - expect(content).toContain("Project:"); - expect(content).toContain("Status:"); - expect(content).toContain("Priority:"); - expect(content).toContain("Type:"); - expect(content).toContain("Creator:"); - } + const filteredDocs = await loader.load(); + + // Verify we got the expected issues + expect(filteredDocs.length).toBeGreaterThan(0); + expect(filteredDocs.length).toBeLessThan(baseIssues.length); + + // Verify all returned issues are created after our cutoff date + const middleDateTimestamp = middleDate.getTime(); + filteredDocs.forEach((doc) => { + const issueDateString = doc.pageContent + .split("\n") + .filter((line) => /^Created: /.test(line))[0] + .replace("Created: ", ""); + const issueDateTimestamp = new Date( + asStringWithoutTime(new Date(issueDateString)) + ).getTime(); + expect(issueDateTimestamp).toBeGreaterThanOrEqual(middleDateTimestamp); + }); + + // Verify we got the same issues as in our original set + const filteredIds = new Set(filteredDocs.map((d) => d.metadata.id)); + const expectedIds = new Set(issuesAfterMiddle.map((issue) => issue.id)); + expect(filteredIds).toEqual(expectedIds); }); test("should handle invalid credentials", async () => { const loader = new JiraProjectLoader({ - host: JIRA_HOST, - projectKey: JIRA_PROJECT_KEY, + ...jiraConf, username: "invalid_username", accessToken: "invalid_token", }); @@ -89,13 +125,85 @@ describe("JiraProjectLoader Integration Tests", () => { test("should handle invalid project key", async () => { const loader = new JiraProjectLoader({ - host: JIRA_HOST, + ...jiraConf, projectKey: "INVALID_PROJECT_KEY", - username: JIRA_USERNAME, - accessToken: JIRA_ACCESS_TOKEN, }); const docs = await loader.load(); expect(docs).toEqual([]); }); + + function requireEnvVar(name: string): string { + // eslint-disable-next-line no-process-env + const value = process.env[name]; + if (!value) { + throw new Error(`environment variable "${name}" must be set`); + } + return value; + } + + function asStringWithoutTime(date: Date): string { + return date.toISOString().split("T")[0]; + } + + function sameDate(a: Date, b: Date) { + return asStringWithoutTime(a) === asStringWithoutTime(b); + } + + function haveTwoDifferentCreationDates(issues: JiraIssue[]): boolean { + return ( + issues.length >= 2 && + issues + .slice(1) + .some( + (issue) => + !sameDate( + new Date(issue.fields.created), + new Date(issues[0].fields.created) + ) + ) + ); + } + + async function loadJiraDocsUntil(predicate: (docs: Document[]) => boolean) { + const load = (createdAfter: Date) => + new JiraProjectLoader({ + ...jiraConf, + createdAfter, + }).load(); + return loadUntil(load, predicate); + } + + async function loadJiraIssuesUntil( + predicate: (docs: JiraIssue[]) => boolean + ) { + const load = (createdAfter: Date) => + new JiraProjectLoader({ + ...jiraConf, + createdAfter, + }).loadAsIssues(); + return loadUntil(load, predicate); + } + + async function loadUntil( + loadCreatedAfter: (date: Date) => Promise, + predicate: (loaded: T[]) => boolean + ): Promise { + const now = new Date(); + let months = 1; + const maxMonths = 120; + + let loaded: T[] = []; + while (!predicate(loaded) && months < maxMonths) { + const createdAfter = new Date(now); + createdAfter.setDate(now.getDate() - months * 30); + loaded = await loadCreatedAfter(createdAfter); + months *= 1.2; + } + + if (months >= maxMonths) { + return []; + } + return loaded; + } }); diff --git a/libs/langchain-community/src/document_loaders/web/jira.ts b/libs/langchain-community/src/document_loaders/web/jira.ts index 631bbbf31eb9..59e0879d2ab9 100644 --- a/libs/langchain-community/src/document_loaders/web/jira.ts +++ b/libs/langchain-community/src/document_loaders/web/jira.ts @@ -369,34 +369,53 @@ export class JiraProjectLoader extends BaseDocumentLoader { } public async load(): Promise { - const allIssues: JiraIssue[] = []; - try { - for await (const issues of this.fetchIssues()) { - allIssues.push(...issues); - } - - return this.documentConverter.convertToDocuments(allIssues); + const allJiraIssues = await this.loadAsIssues(); + return this.documentConverter.convertToDocuments(allJiraIssues); } catch (error) { console.error("Error:", error); return []; } } + public async loadAsIssues(): Promise { + const allIssues: JiraIssue[] = []; + + for await (const issues of this.fetchIssues()) { + allIssues.push(...issues); + } + + return allIssues; + } + + protected toJiraDateString(date: Date | undefined): string | undefined { + if (!date) { + return undefined; + } + const year = date.getFullYear(); + const month = String(date.getMonth() + 1).padStart(2, "0"); + const dayOfMonth = String(date.getDate()).padStart(2, "0"); + return `${year}-${month}-${dayOfMonth}`; + } + protected async *fetchIssues(): AsyncIterable { const authorizationHeader = this.buildAuthorizationHeader(); const url = `${this.host}${API_ENDPOINTS.SEARCH}`; + const createdAfterAsString = this.toJiraDateString(this.createdAfter); let startAt = 0; while (true) { try { const jqlProps = [ `project=${this.projectKey}`, - `startAt=${startAt}`, - `maxResults=${this.limitPerRequest}`, - ...(this.createdAfter ? [`created>${this.createdAfter.toISOString()}`] : []) + ...(createdAfterAsString ? [`created>=${createdAfterAsString}`] : []), ]; - const pageUrl = `${url}?jql=${jqlProps.join('&')}`; + const params = new URLSearchParams({ + jql: jqlProps.join(" AND "), + startAt: `${startAt}`, + maxResults: `${this.limitPerRequest}`, + }); + const pageUrl = `${url}?${params}`; const options = { method: "GET",