diff --git a/README.txt b/README.txt index a713b803d..c143f3db6 100755 --- a/README.txt +++ b/README.txt @@ -1,4 +1,4 @@ ---------------------------- DATAFARI 6.1-DEV ------------------------ +--------------------------- DATAFARI 6.2-DEV ------------------------ Datafari is an open source enterprise search solution. It is the perfect product for anyone who needs to search and analyze its corporate data and documents, both within the content and the metadata. diff --git a/datafari-active-directory-connector/pom.xml b/datafari-active-directory-connector/pom.xml index e0d925e8b..1d59bc557 100644 --- a/datafari-active-directory-connector/pom.xml +++ b/datafari-active-directory-connector/pom.xml @@ -6,7 +6,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community datafari-active-directory-connector @@ -16,7 +16,7 @@ com.francelabs.datafari datafari-mcf-connectors-dependencies - 6.1-dev-Community + 6.2-dev-Community pom provided diff --git a/datafari-adminui/pom.xml b/datafari-adminui/pom.xml index 498bac5c2..99a3b58a6 100644 --- a/datafari-adminui/pom.xml +++ b/datafari-adminui/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-adminui diff --git a/datafari-analytic-stack/pom.xml b/datafari-analytic-stack/pom.xml index 7594c7693..7297f8ef6 100644 --- a/datafari-analytic-stack/pom.xml +++ b/datafari-analytic-stack/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-analytic-stack diff --git a/datafari-cassandra/pom.xml b/datafari-cassandra/pom.xml index fc6ca6154..0c453e442 100644 --- a/datafari-cassandra/pom.xml +++ b/datafari-cassandra/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-cassandra diff --git a/datafari-confluence-v6-connector/pom.xml b/datafari-confluence-v6-connector/pom.xml index 0b33a3273..346fef0d2 100755 --- a/datafari-confluence-v6-connector/pom.xml +++ b/datafari-confluence-v6-connector/pom.xml @@ -6,7 +6,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community datafari-confluence-v6-connector @@ -16,7 +16,7 @@ com.francelabs.datafari datafari-mcf-connectors-dependencies - 6.1-dev-Community + 6.2-dev-Community pom provided diff --git a/datafari-core/pom.xml b/datafari-core/pom.xml index 6100f82ca..18e0bd94a 100755 --- a/datafari-core/pom.xml +++ b/datafari-core/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-core diff --git a/datafari-docfilter-connector/pom.xml b/datafari-docfilter-connector/pom.xml index bf4c38556..54464fa06 100755 --- a/datafari-docfilter-connector/pom.xml +++ b/datafari-docfilter-connector/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-docfilter-connector @@ -14,7 +14,7 @@ com.francelabs.datafari datafari-mcf-connectors-dependencies - 6.1-dev-Community + 6.2-dev-Community pom provided diff --git a/datafari-emptier-connector/pom.xml b/datafari-emptier-connector/pom.xml index 75bfb7b19..59ed70704 100755 --- a/datafari-emptier-connector/pom.xml +++ b/datafari-emptier-connector/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-emptier-connector @@ -14,7 +14,7 @@ com.francelabs.datafari datafari-mcf-connectors-dependencies - 6.1-dev-Community + 6.2-dev-Community pom provided diff --git a/datafari-git-plugin/pom.xml b/datafari-git-plugin/pom.xml index 2d18e078b..c4143f7c8 100644 --- a/datafari-git-plugin/pom.xml +++ b/datafari-git-plugin/pom.xml @@ -6,7 +6,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community datafari-git-plugin war diff --git a/datafari-handler/pom.xml b/datafari-handler/pom.xml index 6832a7d96..452ecf74f 100644 --- a/datafari-handler/pom.xml +++ b/datafari-handler/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-handler diff --git a/datafari-jena/pom.xml b/datafari-jena/pom.xml index 2f6fc485d..5e290dde2 100644 --- a/datafari-jena/pom.xml +++ b/datafari-jena/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-jena diff --git a/datafari-llm-connector/.gitignore b/datafari-llm-connector/.gitignore deleted file mode 100644 index e3e5bc015..000000000 --- a/datafari-llm-connector/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/target/ -/.settings \ No newline at end of file diff --git a/datafari-llm-connector/pom.xml b/datafari-llm-connector/pom.xml deleted file mode 100644 index 63eb7997b..000000000 --- a/datafari-llm-connector/pom.xml +++ /dev/null @@ -1,104 +0,0 @@ - - 4.0.0 - - com.francelabs.datafari - datafari-ce - 6.1-dev-Community - - datafari-llm-connector - Datafari Enterprise Search - LLM connector module - - - - - com.francelabs.datafari - datafari-mcf-connectors-dependencies - 6.1-dev-Community - pom - provided - - - javax.servlet - javax.servlet-api - 3.1.0 - compile - - - org.springframework - spring-web - ${spring.version} - compile - - - dev.langchain4j - langchain4j-core - 0.34.0 - compile - - - dev.langchain4j - langchain4j-open-ai - 0.33.0 - compile - - - dev.langchain4j - langchain4j - 0.34.0 - compile - - - dev.langchain4j - langchain4j-embeddings-all-minilm-l6-v2 - 0.34.0 - - - org.junit.jupiter - junit-jupiter - 5.9.1 - test - - - - - - - - ./src/main/native2ascii - - **/*.properties - - - - - src/main/resources - - **/*.html - **/*.js - - - - - - - - maven-assembly-plugin - - - package - - single - - - - - - jar-with-dependencies - - - - - - - - \ No newline at end of file diff --git a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/Llm.java b/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/Llm.java deleted file mode 100644 index 8747337a7..000000000 --- a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/Llm.java +++ /dev/null @@ -1,638 +0,0 @@ -/** -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -package com.francelabs.datafari.transformation.llm; - -import java.io.BufferedReader; -import java.io.File; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.*; - -import com.francelabs.datafari.transformation.llm.connectors.DatafariLlmService; -import com.francelabs.datafari.transformation.llm.connectors.LlmService; -import com.francelabs.datafari.transformation.llm.connectors.OpenAiLlmService; -import com.francelabs.datafari.transformation.llm.model.LlmSpecification; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.apache.manifoldcf.agents.interfaces.IOutputAddActivity; -import org.apache.manifoldcf.agents.interfaces.IOutputCheckActivity; -import org.apache.manifoldcf.agents.interfaces.RepositoryDocument; -import org.apache.manifoldcf.agents.interfaces.ServiceInterruption; -import org.apache.manifoldcf.agents.system.Logging; -import org.apache.manifoldcf.agents.transformation.BaseTransformationConnector; -import org.apache.manifoldcf.core.interfaces.ConfigParams; -import org.apache.manifoldcf.core.interfaces.IHTTPOutput; -import org.apache.manifoldcf.core.interfaces.IPostParameters; -import org.apache.manifoldcf.core.interfaces.IThreadContext; -import org.apache.manifoldcf.core.interfaces.ManifoldCFException; -import org.apache.manifoldcf.core.interfaces.Specification; -import org.apache.manifoldcf.core.interfaces.SpecificationNode; -import org.apache.manifoldcf.core.interfaces.VersionContext; - -import com.francelabs.datafari.transformation.llm.utils.storage.DestinationStorage; - - - -/** - * Connector to extract entities using a regular expression from document content and put them in metadata. - * - */ -public class Llm extends BaseTransformationConnector { - - public static final String _rcsid = "@(#)$Id: "+ Llm.class.getName() + " $"; - public static final String DEFAULT_ENDPOINT = "https://api.openai.com/v1/"; - private static final int DEFAULT_MAXTOKENS = 500; - - private static final String EDIT_CONFIGURATION_JS = "editConfiguration.js"; - private static final String EDIT_CONFIGURATION_SERVER_HTML = "editConfiguration_llm.html"; - private static final String VIEW_CONFIGURATION_HTML = "viewConfiguration.html"; - private static final String EDIT_SPECIFICATION_JS = "editSpecification.js"; - private static final String EDIT_SPECIFICATION_HTML = "editSpecification_llm.html"; - private static final String VIEW_SPECIFICATION_HTML = "viewSpecification.html"; - - - protected static final String ACTIVITY_LLM = "LLM"; - protected static final String CONTENT = "content"; - protected static final String SEQNUM = "SEQNUM"; - - protected static final String[] activitiesList = new String[] { ACTIVITY_LLM }; - private static final Logger LOGGER = LogManager.getLogger(Llm.class.getName()); - - - /** - * Connect this connector. The configuration parameters are included. - * - * @param configParams are the configuration parameters for this connection. - */ - @Override - public void connect(ConfigParams configParams) { - super.connect(configParams); - } - - /** - * Close the connection. Call this before discarding the repository connector. - */ - @Override - public void disconnect() throws ManifoldCFException { - super.disconnect(); - } - - /** - * This method is periodically called for all connectors that are connected but - * not in active use. - */ - @Override - public void poll() throws ManifoldCFException { - } - - /** - * This method is called to assess whether to count this connector instance should actually be counted as being connected. - * - * @return true if the connector instance is actually connected. - */ - @Override - public boolean isConnected() { - return true; - } - - - /** - * Return a list of activities that this connector generates. The connector does NOT need to be connected before this method is called. - * - * @return the set of activities. - */ - @Override - public String[] getActivitiesList() { - return activitiesList; - } - - /** - * Output the configuration header section. This method is called in the head section of the connector's configuration page. Its purpose is to add the required tabs to the list, and to output any - * javascript methods that might be needed by the configuration editing HTML. - * - * @param threadContext is the local thread context. - * @param out is the output to which any HTML should be sent. - * @param parameters are the configuration parameters, as they currently exist, for this connection being configured. - * @param tabsArray is an array of tab names. Add to this array any tab names that are specific to the connector. - */ - @Override - public void outputConfigurationHeader(final IThreadContext threadContext, final IHTTPOutput out, final Locale locale, final ConfigParams parameters, final List tabsArray) - throws ManifoldCFException, IOException { - tabsArray.add(Messages.getString(locale, "llm.TabName")); - Messages.outputResourceWithVelocity(out, locale, EDIT_CONFIGURATION_JS, null); - } - - /** - * Output the configuration body section. This method is called in the body section of the connector's configuration page. Its purpose is to present the required form elements for editing. The coder - * can presume that the HTML that is output from this configuration will be within appropriate , , and
tags. The name of the form is "editconnection". - * - * @param threadContext is the local thread context. - * @param out is the output to which any HTML should be sent. - * @param parameters are the configuration parameters, as they currently exist, for this connection being configured. - * @param tabName is the current tab name. - */ - @Override - public void outputConfigurationBody(final IThreadContext threadContext, final IHTTPOutput out, final Locale locale, final ConfigParams parameters, final String tabName) - throws ManifoldCFException, IOException { - final Map velocityContext = new HashMap<>(); - velocityContext.put("TabName", tabName); - fillInAPITab(velocityContext, out, parameters); - Messages.outputResourceWithVelocity(out, locale, EDIT_CONFIGURATION_SERVER_HTML, velocityContext); - } - - /** - * Process a configuration post. This method is called at the start of the connector's configuration page, whenever there is a possibility that form data for a connection has been posted. Its - * purpose is to gather form information and modify the configuration parameters accordingly. The name of the posted form is "editconnection". - * - * @param threadContext is the local thread context. - * @param variableContext is the set of variables available from the post, including binary file post information. - * @param parameters are the configuration parameters, as they currently exist, for this connection being configured. - * @return null if all is well, or a string error message if there is an error that should prevent saving of the connection (and cause a redirection to an error page). - */ - @Override - public String processConfigurationPost(final IThreadContext threadContext, final IPostParameters variableContext, final Locale locale, final ConfigParams parameters) throws ManifoldCFException { - - if (variableContext.getParameter("llmService") != null) { - parameters.setParameter(LlmConfig.NODE_LLM_SERVICE, variableContext.getParameter("llmService")); - } - if (variableContext.getParameter("endpointToUse") != null) { - parameters.setParameter(LlmConfig.NODE_ENDPOINT, variableContext.getParameter("endpointToUse")); - } - if (variableContext.getParameter("llmToUse") != null) { - parameters.setParameter(LlmConfig.NODE_LLM, variableContext.getParameter("llmToUse")); - } - if (variableContext.getParameter("embeddingsModelToUse") != null) { - parameters.setParameter(LlmConfig.NODE_EMBEDDINGS_MODEL, variableContext.getParameter("embeddingsModelToUse")); - } - if (variableContext.getParameter("llmApiKey") != null) { - parameters.setParameter(LlmConfig.NODE_APIKEY, variableContext.getParameter("llmApiKey")); - } - if (variableContext.getParameter("dimensions") != null) { - parameters.setParameter(LlmConfig.NODE_VECTOR_DIMENSION, variableContext.getParameter("dimensions")); - } - - return null; - } - - /** - * View configuration. This method is called in the body section of the connector's view configuration page. Its purpose is to present the connection information to the user. The coder can presume - * that the HTML that is output from this configuration will be within appropriate and tags. - * - * @param threadContext is the local thread context. - * @param out is the output to which any HTML should be sent. - * @param parameters are the configuration parameters, as they currently exist, for this connection being configured. - */ - @Override - public void viewConfiguration(final IThreadContext threadContext, final IHTTPOutput out, final Locale locale, final ConfigParams parameters) throws ManifoldCFException, IOException { - final Map velocityContext = new HashMap<>(); - fillInAPITab(velocityContext, out, parameters); - Messages.outputResourceWithVelocity(out, locale, VIEW_CONFIGURATION_HTML, velocityContext); - } - - - protected static void fillInAPITab(final Map velocityContext, final IHTTPOutput out, final ConfigParams parameters) throws ManifoldCFException { - - String endpointToUse = (parameters.getParameter(LlmConfig.NODE_ENDPOINT) != null) ? parameters.getParameter(LlmConfig.NODE_ENDPOINT) : DEFAULT_ENDPOINT; - String llmService = (parameters.getParameter(LlmConfig.NODE_LLM_SERVICE) != null) ? parameters.getParameter(LlmConfig.NODE_LLM_SERVICE) : "openai"; - String llmToUse = (parameters.getParameter(LlmConfig.NODE_LLM) != null) ? parameters.getParameter(LlmConfig.NODE_LLM) : ""; - String embeddingsModelToUse = (parameters.getParameter(LlmConfig.NODE_EMBEDDINGS_MODEL) != null) ? parameters.getParameter(LlmConfig.NODE_EMBEDDINGS_MODEL) : ""; - String llmApiKey = (parameters.getParameter(LlmConfig.NODE_APIKEY) != null) ? parameters.getParameter(LlmConfig.NODE_APIKEY) : ""; - String dimensions = (parameters.getParameter(LlmConfig.NODE_VECTOR_DIMENSION) != null) ? parameters.getParameter(LlmConfig.NODE_VECTOR_DIMENSION) : "250"; - - - // Fill in context - velocityContext.put("ENDPOINT", endpointToUse); - velocityContext.put("LLMTYPE", llmService); - velocityContext.put("EMBEDDINGMODEL", embeddingsModelToUse); - velocityContext.put("LLM", llmToUse); - velocityContext.put("APIKEY", llmApiKey); - velocityContext.put("DIMENSIONS", dimensions); - } - - /** - * Get an output version string, given an output specification. The output version string is used to uniquely describe the pertinent details of the output specification and the configuration, to - * allow the Connector Framework to determine whether a document will need to be output again. Note that the contents of the document cannot be considered by this method, and that a different - * version string (defined in IRepositoryConnector) is used to describe the version of the actual document. - * - * This method presumes that the connector object has been configured, and it is thus able to communicate with the output data store should that be necessary. - * - * @param spec is the current output specification for the job that is doing the crawling. - * @return a string, of unlimited length, which uniquely describes output configuration and specification in such a way that if two such strings are equal, the document will not need to be sent - * again to the output data store. - */ - @Override - public VersionContext getPipelineDescription(final Specification spec) throws ManifoldCFException, ServiceInterruption { - String versionString = getVersionString(spec); - return new VersionContext(versionString, params, spec); - } - - // ------------------------------------------------------------------------------------------------------------------------------------------------ - // We intercept checks pertaining to the document format and send modified checks further down - // ------------------------------------------------------------------------------------------------------------------------------------------------ - - /** - * Detect if a mime type is acceptable or not. This method is used to determine whether it makes sense to fetch a document in the first place. - * - * @param pipelineDescription is the document's pipeline version string, for this connection. - * @param mimeType is the mime type of the document. - * @param checkActivity is an object including the activities that can be performed by this method. - * @return true if the mime type can be accepted by this connector. - */ - @Override - public boolean checkMimeTypeIndexable(final VersionContext pipelineDescription, final String mimeType, final IOutputCheckActivity checkActivity) throws ManifoldCFException, ServiceInterruption { - return true; - } - - /** - * Pre-determine whether a document (passed here as a File object) is acceptable or not. This method is used to determine whether a document needs to be actually transferred. This hook is provided - * mainly to support search engines that only handle a small set of accepted file types. - * - * @param pipelineDescription is the document's pipeline version string, for this connection. - * @param localFile is the local file to check. - * @param checkActivity is an object including the activities that can be done by this method. - * @return true if the file is acceptable, false if not. - */ - @Override - public boolean checkDocumentIndexable(final VersionContext pipelineDescription, final File localFile, final IOutputCheckActivity checkActivity) throws ManifoldCFException, ServiceInterruption { - // Document contents are not germane anymore, unless it looks like Tika - // won't accept them. - // Not sure how to check that... - return true; - } - - /** - * Pre-determine whether a document's length is acceptable. This method is used to determine whether to fetch a document in the first place. - * - * @param pipelineDescription is the document's pipeline version string, for this connection. - * @param length is the length of the document. - * @param checkActivity is an object including the activities that can be done by this method. - * @return true if the file is acceptable, false if not. - */ - @Override - public boolean checkLengthIndexable(final VersionContext pipelineDescription, final long length, final IOutputCheckActivity checkActivity) throws ManifoldCFException, ServiceInterruption { - // Always true - return true; - } - - // End Checks ------------------------------------------------------------------------------------------------------------------------------------- - - /** - * Add (or replace) a document in the output data store using the connector. This method presumes that the connector object has been configured, and it is thus able to communicate with the output - * data store should that be necessary. The OutputSpecification is *not* provided to this method, because the goal is consistency, and if output is done it must be consistent with the output - * description, since that was what was partly used to determine if output should be taking place. So it may be necessary for this method to decode an output description string in order to determine - * what should be done. - *

- * This override method's fonctionnality: - * - * @param documentURI is the URI of the document. The URI is presumed to be the unique identifier which the output data store will use to process and serve the document. This URI is - * constructed by the repository connector which fetches the document, and is thus universal across all output connectors. - * @param pipelineDescription is the description string that was constructed for this document by the getOutputDescription() method. - * @param document is the document data to be processed (handed to the output data store). - * @param authorityNameString is the name of the authority responsible for authorizing any access tokens passed in with the repository document. May be null. - * @param activities is the handle to an object that the implementer of a pipeline connector may use to perform operations, such as logging processing activity, or sending a modified - * document to the next stage in the pipeline. - * @return the document status (accepted or permanently rejected). - * @throws IOException only if there's a stream error reading the document data. - */ - @Override - public int addOrReplaceDocumentWithException(final String documentURI, final VersionContext pipelineDescription, final RepositoryDocument document, final String authorityNameString, - final IOutputAddActivity activities) throws ManifoldCFException, ServiceInterruption, IOException { - - - final LlmSpecification spec = new LlmSpecification(pipelineDescription.getSpecification(), getConfiguration()); - - boolean hasError = false; - final long startTime = System.currentTimeMillis(); - - // Prepare storage for reading document content. A suitable storage depending on content size. - DestinationStorage storage = DestinationStorage.getDestinationStorage(document.getBinaryLength(), getClass()); - StringBuilder contentBuilder = new StringBuilder(); - try { - // Reading file content - try { - // Transfert document content to the storage - long binaryLength = document.getBinaryStream().transferTo(storage.getOutputStream()); - - // The input stream of the document has been totally read by previous instruction, so set a new one - document.setBinary(storage.getInputStream(), binaryLength); - - // Prepare reading of document copied to extract metadata - BufferedReader buffRead = new BufferedReader(new InputStreamReader(storage.getInputStream())); - - // Read lines - String line = buffRead.readLine(); - while (line != null ) { - contentBuilder.append(line); - line = buffRead.readLine(); - } - buffRead.close(); - - } catch (Exception e) { - hasError = true; - activities.recordActivity(startTime, ACTIVITY_LLM, document.getBinaryLength(), documentURI, "KO", e.getMessage()); - Logging.ingest.error("Unable to browse document " + documentURI, e); - } - - String content = contentBuilder.toString(); - - if (content.length() > 20000) { - content = content.substring(0, 20000); - } - - // Select the proper service depending on the LLM - LlmService service; - switch (spec.getTypeOfLlm()) { - case "openai": - service = new OpenAiLlmService(spec); - break; - case "datafari": - default: - service = new DatafariLlmService(spec); - break; - } - - - // SUMMARIZE DOCUMENTS - if (spec.getEnableSummarize()) { - try { - String summary = service.summarize(content, spec); - if (summary.isEmpty()) throw new RuntimeException("Could not generate a summary for document: " + documentURI); - document.addField("llm_summary", summary); - } catch (Exception e) { - LOGGER.warn("Could not generate a summary for document: {}", documentURI); - } - } - - // CATEGORIZE DOCUMENTS - // Invoice, Call for Tenders, Request for Quotations, Technical paper, Presentation, Resumes, Others - if (spec.getEnableCategorize()) { - try { - String category = extractCategory(service.summarize(content, spec)); - if (category.isEmpty()) throw new RuntimeException("Could not generate a summary for document: " + documentURI); - document.addField("llm_categories", category); - } catch (Exception e) { - LOGGER.warn("Could not find category for document: {}", documentURI); - } - } - - - // EMBBED DOCUMENTS - // Send chunk to LLM for embedding - if (spec.getEnableVectorEmbedding()) { - - if (content.length() > 15000) { - content = content.substring(0, 15000); - } - float[] response = service.embeddings(content); - String[] strvector = new String[response.length]; - for (int i = 0; i < response.length; i++ ) { - strvector[i] = Float.toString(response[i]); - } - document.addField("llm_vector", strvector); - } - - - - if (!hasError) activities.recordActivity(startTime, ACTIVITY_LLM, document.getBinaryLength(), documentURI, "OK", ""); - return activities.sendDocument(documentURI, document); - - } finally { - // Clean storage (for instance, delete temporary file) after all treatment on document done (Solr indexing). - storage.close(); - } - - } - - /** - * Extract the category from the LLM response - * - * @param message The LLM esponse - * @return The category - */ - public String extractCategory(String message) { - String[] categories = {"Invoice", "Call for Tenders", "Request for Quotations", "Technical paper", "Presentation", "Resumes"}; - for (String category : categories) { - if (message.contains(category)) return category; - } - return "Others"; - } - - - /** - * Obtain the name of the form check javascript method to call. - * - * @param connectionSequenceNumber is the unique number of this connection within the job. - * @return the name of the form check javascript method. - */ - @Override - public String getFormCheckJavascriptMethodName(final int connectionSequenceNumber) { - return "s" + connectionSequenceNumber + "_checkSpecification"; - } - - /** - * Obtain the name of the form presave check javascript method to call. - * - * @param connectionSequenceNumber is the unique number of this connection within the job. - * @return the name of the form presave check javascript method. - */ - @Override - public String getFormPresaveCheckJavascriptMethodName(final int connectionSequenceNumber) { - return "s" + connectionSequenceNumber + "_checkSpecificationForSave"; - } - - /** - * Output the specification header section. This method is called in the head section of a job page which has selected a pipeline connection of the current type. Its purpose is to add the required - * tabs to the list, and to output any javascript methods that might be needed by the job editing HTML. - * - * @param out is the output to which any HTML should be sent. - * @param locale is the preferred local of the output. - * @param spec is the current pipeline specification for this connection. - * @param connectionSequenceNumber is the unique number of this connection within the job. - * @param tabsArray is an array of tab names. Add to this array any tab names that are specific to the connector. - */ - @Override - public void outputSpecificationHeader(final IHTTPOutput out, final Locale locale, final Specification spec, final int connectionSequenceNumber, final List tabsArray) - throws ManifoldCFException, IOException { - final Map paramMap = new HashMap<>(); - paramMap.put(SEQNUM, Integer.toString(connectionSequenceNumber)); - - tabsArray.add(Messages.getString(locale, "llm.TabName")); - - // Fill in the specification header map, using data from all tabs. - fillInLlmSpecificationMap(paramMap, spec); - - Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_JS, paramMap); - } - - /** - * Output the specification body section. This method is called in the body section of a job page which has selected a pipeline connection of the current type. Its purpose is to present the required - * form elements for editing. The coder can presume that the HTML that is output from this configuration will be within appropriate , , and tags. The name of the form is - * "editjob". - * - * @param out is the output to which any HTML should be sent. - * @param locale is the preferred local of the output. - * @param spec is the current pipeline specification for this job. - * @param connectionSequenceNumber is the unique number of this connection within the job. - * @param actualSequenceNumber is the connection within the job that has currently been selected. - * @param tabName is the current tab name. - */ - @Override - public void outputSpecificationBody(final IHTTPOutput out, final Locale locale, final Specification spec, final int connectionSequenceNumber, final int actualSequenceNumber, final String tabName) - throws ManifoldCFException, IOException { - final Map paramMap = new HashMap<>(); - - // Set the tab name - paramMap.put("TABNAME", tabName); - paramMap.put(SEQNUM, Integer.toString(connectionSequenceNumber)); - paramMap.put("SELECTEDNUM", Integer.toString(actualSequenceNumber)); - - // Fill in the field mapping tab data - fillInLlmSpecificationMap(paramMap, spec); - - - Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_HTML, paramMap); - } - - /** - * Process a specification post. This method is called at the start of job's edit or view page, whenever there is a possibility that form data for a connection has been posted. Its purpose is to - * gather form information and modify the transformation specification accordingly. The name of the posted form is "editjob". - * - * @param variableContext contains the post data, including binary file-upload information. - * @param locale is the preferred local of the output. - * @param spec is the current pipeline specification for this job. - * @param connectionSequenceNumber is the unique number of this connection within the job. - * @return null if all is well, or a string error message if there is an error that should prevent saving of the job (and cause a redirection to an error page). - */ - @Override - public String processSpecificationPost(final IPostParameters variableContext, final Locale locale, final Specification spec, final int connectionSequenceNumber) throws ManifoldCFException { - - final String seqPrefix = "s" + connectionSequenceNumber + "_"; - - addChildToSpec(variableContext, spec, seqPrefix + "enableSummarize", LlmConfig.NODE_ENABLE_SUMMARIZE); - addChildToSpec(variableContext, spec, seqPrefix + "enableCategorize", LlmConfig.NODE_ENABLE_CATEGORIZE); - addChildToSpec(variableContext, spec, seqPrefix + "enableEmbeddings", LlmConfig.NODE_ENABLE_EMBEDDINGS); - addChildToSpec(variableContext, spec, seqPrefix + "maxTokens", LlmConfig.NODE_MAXTOKENS); - addChildToSpec(variableContext, spec, seqPrefix + "summariesLanguage", LlmConfig.NODE_SUMMARIES_LANGUAGE); - - return null; - } - - private static void addChildToSpec(IPostParameters variableContext, Specification spec, String fieldName, String nodeName) { - final SpecificationNode node = new SpecificationNode(nodeName); - final String value = variableContext.getParameter(fieldName); - if (value != null) { - node.setAttribute(LlmConfig.ATTRIBUTE_VALUE, value); - } else { - node.setAttribute(LlmConfig.ATTRIBUTE_VALUE, ""); - } - spec.addChild(spec.getChildCount(), node); - } - - - /** - * View specification. This method is called in the body section of a job's view page. Its purpose is to present the pipeline specification information to the user. The coder can presume that the - * HTML that is output from this configuration will be within appropriate and tags. - * - * @param out is the output to which any HTML should be sent. - * @param locale is the preferred local of the output. - * @param connectionSequenceNumber is the unique number of this connection within the job. - * @param spec is the current pipeline specification for this job. - */ - @Override - public void viewSpecification(final IHTTPOutput out, final Locale locale, final Specification spec, final int connectionSequenceNumber) throws ManifoldCFException, IOException { - final Map paramMap = new HashMap<>(); - paramMap.put(SEQNUM, Integer.toString(connectionSequenceNumber)); - - // Fill in the map with data from all tabs - fillInLlmSpecificationMap(paramMap, spec); - - Messages.outputResourceWithVelocity(out, locale, VIEW_SPECIFICATION_HTML, paramMap); - - } - - protected static void fillInLlmSpecificationMap(final Map paramMap, final Specification os) { - // Prep for field mappings - String enableSummarize = "false"; - String enableCategorize = "false"; - String enableEmbeddings = "false"; - int maxTokens = 400; - String summariesLanguage = ""; - - for (int i = 0; i < os.getChildCount(); i++) { - final SpecificationNode sn = os.getChild(i); - if (sn.getType().equals(LlmConfig.NODE_ENABLE_SUMMARIZE)) { - enableSummarize = sn.getAttributeValue(LlmConfig.ATTRIBUTE_VALUE); - } else if (sn.getType().equals(LlmConfig.NODE_ENABLE_CATEGORIZE)) { - enableCategorize = sn.getAttributeValue(LlmConfig.ATTRIBUTE_VALUE); - } else if (sn.getType().equals(LlmConfig.NODE_ENABLE_EMBEDDINGS)) { - enableEmbeddings = sn.getAttributeValue(LlmConfig.ATTRIBUTE_VALUE); - } else if (sn.getType().equals(LlmConfig.NODE_MAXTOKENS)) { - try { - maxTokens = Integer.parseInt(sn.getAttributeValue(LlmConfig.ATTRIBUTE_VALUE)); - } catch (NumberFormatException ex) { - maxTokens = DEFAULT_MAXTOKENS; - } - } else if (sn.getType().equals(LlmConfig.NODE_SUMMARIES_LANGUAGE)) { - summariesLanguage = sn.getAttributeValue(LlmConfig.ATTRIBUTE_VALUE); - } - } - paramMap.put("ENABLESUMMARIZE", enableSummarize); - paramMap.put("ENABLECATEGORIZE", enableCategorize); - paramMap.put("ENABLEEMBEDDINGS", enableEmbeddings); - paramMap.put("MAXTOKENS", maxTokens); - paramMap.put("SUMMARIESLANGUAGE", summariesLanguage); - } - - /** - * Create a Version String for this connector configuration. To be used by getPipelineDescription(). - * - * @param spec the specification object associated with this connector. - * @return the Version String - */ - protected String getVersionString(Specification spec) { - StringBuilder versionString = new StringBuilder(); - - // Browse specification nodes and their attributes - int nbNodes = spec.getChildCount(); - SpecificationNode specNode; - Iterator itAttributesName; - String attributeValue; - String attributeName; - for (int i=0; i < nbNodes; i++) { - specNode = spec.getChild(i); - if (i > 0) { - versionString.append('+'); - } - versionString.append(specNode.getType()); - - itAttributesName = specNode.getAttributes(); - while (itAttributesName.hasNext()) { - attributeName = itAttributesName.next(); - attributeValue = specNode.getAttributeValue(attributeName); - - if (!attributeValue.isEmpty()) { - versionString.append('+'); - versionString.append(attributeName); - versionString.append(':'); - versionString.append(attributeValue); - } - } - } - return versionString.toString(); - } - -} diff --git a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/LlmConfig.java b/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/LlmConfig.java deleted file mode 100644 index 647231b79..000000000 --- a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/LlmConfig.java +++ /dev/null @@ -1,20 +0,0 @@ -package com.francelabs.datafari.transformation.llm; - -public class LlmConfig { - - // Configuration parameters - public static final String NODE_ENDPOINT = "llmNodeEndpoint"; - public static final String NODE_LLM = "llmNodeLlmToUse"; - public static final String NODE_EMBEDDINGS_MODEL = "llmNodeEmbeddingsModel"; - public static final String NODE_APIKEY = "llmNodeApiKey"; - public static final String NODE_VECTOR_DIMENSION = "llmNodeVectorDimension"; - - // Specification nodes and values - public static final String NODE_LLM_SERVICE = "llmService"; - public static final String NODE_ENABLE_SUMMARIZE = "enableSummarize"; - public static final String NODE_ENABLE_CATEGORIZE = "enableCategorize"; - public static final String NODE_ENABLE_EMBEDDINGS = "enableEmbeddings"; - public static final String NODE_MAXTOKENS = "maxTokens"; - public static final String NODE_SUMMARIES_LANGUAGE = "summariesLanguage"; - public static final String ATTRIBUTE_VALUE = "value"; -} diff --git a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/Messages.java b/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/Messages.java deleted file mode 100644 index 22eceb526..000000000 --- a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/Messages.java +++ /dev/null @@ -1,97 +0,0 @@ -package com.francelabs.datafari.transformation.llm; - -import java.util.Locale; -import java.util.Map; - -import org.apache.manifoldcf.core.interfaces.IHTTPOutput; -import org.apache.manifoldcf.core.interfaces.ManifoldCFException; - -public class Messages extends org.apache.manifoldcf.ui.i18n.Messages { - public static final String DEFAULT_BUNDLE_NAME = "com.francelabs.datafari.transformation.llm.common"; - public static final String DEFAULT_PATH_NAME = "com.francelabs.datafari.transformation.llm"; - - /** - * Constructor - do no instantiate - */ - protected Messages() { - } - - public static String getString(final Locale locale, final String messageKey) { - return getString(DEFAULT_BUNDLE_NAME, locale, messageKey, null); - } - - public static String getAttributeString(final Locale locale, final String messageKey) { - return getAttributeString(DEFAULT_BUNDLE_NAME, locale, messageKey, null); - } - - public static String getBodyString(final Locale locale, final String messageKey) { - return getBodyString(DEFAULT_BUNDLE_NAME, locale, messageKey, null); - } - - public static String getAttributeJavascriptString(final Locale locale, final String messageKey) { - return getAttributeJavascriptString(DEFAULT_BUNDLE_NAME, locale, messageKey, null); - } - - public static String getBodyJavascriptString(final Locale locale, final String messageKey) { - return getBodyJavascriptString(DEFAULT_BUNDLE_NAME, locale, messageKey, null); - } - - public static String getString(final Locale locale, final String messageKey, final Object[] args) { - return getString(DEFAULT_BUNDLE_NAME, locale, messageKey, args); - } - - public static String getAttributeString(final Locale locale, final String messageKey, final Object[] args) { - return getAttributeString(DEFAULT_BUNDLE_NAME, locale, messageKey, args); - } - - public static String getBodyString(final Locale locale, final String messageKey, final Object[] args) { - return getBodyString(DEFAULT_BUNDLE_NAME, locale, messageKey, args); - } - - public static String getAttributeJavascriptString(final Locale locale, final String messageKey, final Object[] args) { - return getAttributeJavascriptString(DEFAULT_BUNDLE_NAME, locale, messageKey, args); - } - - public static String getBodyJavascriptString(final Locale locale, final String messageKey, final Object[] args) { - return getBodyJavascriptString(DEFAULT_BUNDLE_NAME, locale, messageKey, args); - } - - // More general methods which allow bundlenames and class loaders to be specified. - - public static String getString(final String bundleName, final Locale locale, final String messageKey, final Object[] args) { - return getString(Messages.class, bundleName, locale, messageKey, args); - } - - public static String getAttributeString(final String bundleName, final Locale locale, final String messageKey, final Object[] args) { - return getAttributeString(Messages.class, bundleName, locale, messageKey, args); - } - - public static String getBodyString(final String bundleName, final Locale locale, final String messageKey, final Object[] args) { - return getBodyString(Messages.class, bundleName, locale, messageKey, args); - } - - public static String getAttributeJavascriptString(final String bundleName, final Locale locale, final String messageKey, final Object[] args) { - return getAttributeJavascriptString(Messages.class, bundleName, locale, messageKey, args); - } - - public static String getBodyJavascriptString(final String bundleName, final Locale locale, final String messageKey, final Object[] args) { - return getBodyJavascriptString(Messages.class, bundleName, locale, messageKey, args); - } - - // Resource output - - public static void outputResource(final IHTTPOutput output, final Locale locale, final String resourceKey, final Map substitutionParameters, final boolean mapToUpperCase) - throws ManifoldCFException { - outputResource(output, Messages.class, DEFAULT_PATH_NAME, locale, resourceKey, substitutionParameters, mapToUpperCase); - } - - public static void outputResourceWithVelocity(final IHTTPOutput output, final Locale locale, final String resourceKey, final Map substitutionParameters, final boolean mapToUpperCase) - throws ManifoldCFException { - outputResourceWithVelocity(output, Messages.class, DEFAULT_BUNDLE_NAME, DEFAULT_PATH_NAME, locale, resourceKey, substitutionParameters, mapToUpperCase); - } - - public static void outputResourceWithVelocity(final IHTTPOutput output, final Locale locale, final String resourceKey, final Map contextObjects) throws ManifoldCFException { - outputResourceWithVelocity(output, Messages.class, DEFAULT_BUNDLE_NAME, DEFAULT_PATH_NAME, locale, resourceKey, contextObjects); - } - -} diff --git a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/connectors/DatafariLlmService.java b/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/connectors/DatafariLlmService.java deleted file mode 100644 index 3406e5098..000000000 --- a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/connectors/DatafariLlmService.java +++ /dev/null @@ -1,145 +0,0 @@ -package com.francelabs.datafari.transformation.llm.connectors; - -import com.francelabs.datafari.transformation.llm.model.LlmSpecification; -import com.francelabs.datafari.transformation.llm.utils.PromptUtils; -import dev.langchain4j.data.embedding.Embedding; -import dev.langchain4j.model.embedding.EmbeddingModel; -import dev.langchain4j.model.embedding.onnx.allminilml6v2.AllMiniLmL6V2EmbeddingModel; -import dev.langchain4j.model.output.Response; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.json.simple.JSONArray; -import org.json.simple.JSONObject; -import org.springframework.http.HttpEntity; -import org.springframework.http.HttpHeaders; -import org.springframework.http.MediaType; -import org.springframework.web.client.RestClientException; -import org.springframework.web.client.RestTemplate; - -import java.io.IOException; - -public class DatafariLlmService implements LlmService { - - private static final Logger LOGGER = LogManager.getLogger(DatafariLlmService.class.getName()); - - String url; - String temperature; - int maxToken; - String model; - String apiKey; // If not used in the future, this might be removed - LlmSpecification spec; - - public DatafariLlmService(LlmSpecification spec) { - this.url = spec.getLlmEndpoint(); - this.temperature = "0"; - this.maxToken = spec.getMaxTokens(); - this.model = spec.getLlm(); - this.apiKey = spec.getApiKey(); - this.spec = spec; - } - - /** - * Call the Datafari External LLM Webservice - * @param prompts A list of prompts. Each prompt contains instructions for the model, document content and the user query - * @return The string LLM response - */ - public String generate(String prompts) { - - try { - - RestTemplate template = new RestTemplate(); - HttpHeaders headers = new HttpHeaders(); - headers.setContentType(MediaType.APPLICATION_JSON); - if (!apiKey.isEmpty()) headers.setBearerAuth(apiKey); - HttpEntity requestEntity = new HttpEntity<>(prompts, headers); - String resp = ""; - String endpoint = url + "/invoke"; - try{ - DatafariRagResponse response = template.postForObject(endpoint, requestEntity, DatafariRagResponse.class); - if (response == null) { - throw new RestClientException("An error occurred while calling external webservices. The response does not provide any information"); - } else if (response.getOutput().isEmpty() && !response.getError().isEmpty()) { - throw new RestClientException("An error occurred while calling external webservices: " + response.getError()); - } else if (response.getOutput().isEmpty() && response.getError().isEmpty()) { - throw new RestClientException("An error occurred while calling external webservices: the response output is empty."); - } else { - resp = response.getOutput(); - } - - if (resp == null) throw new RestClientException("An error occurred while calling external webservices: " + response.getError()); - resp = response.getOutput(); - - return resp; - } catch(NullPointerException e){ - throw new RestClientException("An error occurred while calling external webservices.", e); - } - - - } catch (Exception e) { - throw new RestClientException("An error occurred while calling external webservices.", e); - } - } - - /** - * Generate the body attached to the request sent to the LLM - * @param prompt A single String prompt. Each prompt contains instructions for the model, document content and the user query - * @return A JSON String - */ - public String generateRequestBody(String prompt) { - - JSONObject queryBody = new JSONObject(); - JSONObject input = new JSONObject(); - JSONArray queries = new JSONArray(); - if (!temperature.isEmpty()) input.put("temperature", temperature); - if (maxToken == 0) input.put("max_tokens", maxToken); - if (!model.isEmpty()) input.put("model", model); - - JSONObject query = new JSONObject(); - query.put("content", prompt); - queries.add(query); - - input.put("queries", queries); - queryBody.put("input", input); - return queryBody.toJSONString(); - } - - @Override - public String invoke(String content) throws IOException { - return generate(content); - } - - @Override - public float[] embeddings(String content) throws IOException { - - // Embedding the document - EmbeddingModel embeddingModel = new AllMiniLmL6V2EmbeddingModel(); - Response embedding = embeddingModel.embed(content); - - LOGGER.info("Vector embedding : {}", embedding); - return embedding.content().vector(); - } - - @Override - public String summarize(String content, LlmSpecification spec) throws IOException { - String prompt = PromptUtils.promptForSummarization(content, spec); - return invoke(prompt); - } - - @Override - public String categorize(String content) throws IOException { - String prompt = PromptUtils.promptForCategorization(content); - return invoke(prompt); - } -} - -class DatafariRagResponse { - - String output; - public String getOutput() { - return output; - } - String error; - public String getError() { - return error; - } -} \ No newline at end of file diff --git a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/connectors/LlmService.java b/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/connectors/LlmService.java deleted file mode 100644 index f0087945d..000000000 --- a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/connectors/LlmService.java +++ /dev/null @@ -1,33 +0,0 @@ -package com.francelabs.datafari.transformation.llm.connectors; - -import com.francelabs.datafari.transformation.llm.model.LlmSpecification; - -import java.io.IOException; - -public interface LlmService { - /** - * - * @param content The document content - * @return The string LLM response - */ - String invoke(String content) throws IOException; - - /** - * @param content The document content - * @return The string LLM response - */ - float[] embeddings(String content) throws IOException; - - /** - * @param content The document content - * @return The string LLM response - */ - String summarize(String content, LlmSpecification spec) throws IOException; - - /** - * @param content The document content - * @return The string LLM response - */ - String categorize(String content) throws IOException; - -} diff --git a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/connectors/OpenAiLlmService.java b/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/connectors/OpenAiLlmService.java deleted file mode 100644 index a58f54572..000000000 --- a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/connectors/OpenAiLlmService.java +++ /dev/null @@ -1,95 +0,0 @@ -package com.francelabs.datafari.transformation.llm.connectors; - -import com.francelabs.datafari.transformation.llm.model.LlmSpecification; -import com.francelabs.datafari.transformation.llm.utils.PromptUtils; -import dev.langchain4j.data.embedding.Embedding; -import dev.langchain4j.model.chat.ChatLanguageModel; -import dev.langchain4j.model.embedding.EmbeddingModel; -import dev.langchain4j.model.openai.OpenAiChatModel; -import dev.langchain4j.model.openai.OpenAiEmbeddingModel; -import dev.langchain4j.model.output.Response; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import java.io.IOException; - -public class OpenAiLlmService implements LlmService { - - private static final Logger LOGGER = LogManager.getLogger(OpenAiLlmService.class.getName()); - LlmSpecification spec; - - double temperature; - int maxToken; - int dimensions = 124; - static final String DEFAULT_LLM_MODEL = "gpt-3.5-turbo"; - static final String DEFAULT_EMBEDDINGS_MODEL = "text-embedding-3-small"; - static final int DEFAULT_DIMENSION = 250; - static final String DEFAULT_URL = "https://api.openai.com/v1/"; - - public OpenAiLlmService(LlmSpecification spec) { - this.temperature = 0; - try { - this.maxToken = spec.getMaxTokens(); - } catch (NumberFormatException e) { - spec.setMaxTokens(200); - } - try { - this.dimensions = (spec.getVectorDimension() < 1) ? DEFAULT_DIMENSION : spec.getVectorDimension(); - } catch (NumberFormatException e) { - spec.setVectorDimension(124); - } - - if (spec.getLlm().isEmpty()) spec.setLlm(DEFAULT_LLM_MODEL); - if (spec.getEmbeddingsModel().isEmpty()) spec.setEmbeddingsModel(DEFAULT_EMBEDDINGS_MODEL); - if (spec.getLlmEndpoint().isEmpty()) spec.setLlmEndpoint(DEFAULT_URL); - - this.spec = spec; - } - - - /** - * Call the Datafari External LLM Webservice - * @param prompt A ready-to-use prompt for the LLM - * @return The string LLM response - */ - @Override - public String invoke(String prompt) throws IOException { - - ChatLanguageModel llm = OpenAiChatModel.builder() - .apiKey(spec.getApiKey()) - .temperature(temperature) - .maxTokens(spec.getMaxTokens()) - .modelName(spec.getLlm()) - .baseUrl(spec.getLlmEndpoint()) - .build(); - - return llm.generate(prompt); - } - - @Override - public float[] embeddings(String content) throws IOException { - - EmbeddingModel llm = OpenAiEmbeddingModel.builder() - .apiKey(spec.getApiKey()) - .modelName(spec.getEmbeddingsModel()) - .baseUrl(spec.getLlmEndpoint()) - .dimensions(spec.getVectorDimension()) - .build(); - - Response embedding = llm.embed(content); - LOGGER.info("Vector embedding : {}", embedding); - return embedding.content().vector(); - } - - @Override - public String summarize(String content, LlmSpecification spec) throws IOException { - String prompt = PromptUtils.promptForSummarization(content, spec); - return invoke(prompt); - } - - @Override - public String categorize(String content) throws IOException { - String prompt = PromptUtils.promptForCategorization(content); - return invoke(prompt); - } -} \ No newline at end of file diff --git a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/model/LlmSpecification.java b/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/model/LlmSpecification.java deleted file mode 100644 index 15551ca65..000000000 --- a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/model/LlmSpecification.java +++ /dev/null @@ -1,168 +0,0 @@ -package com.francelabs.datafari.transformation.llm.model; - -import com.francelabs.datafari.transformation.llm.LlmConfig; -import org.apache.manifoldcf.core.interfaces.ConfigNode; -import org.apache.manifoldcf.core.interfaces.ConfigParams; -import org.apache.manifoldcf.core.interfaces.Specification; -import org.apache.manifoldcf.core.interfaces.SpecificationNode; - -import java.util.Objects; - -/** - * This class contains specifications for the LLM Connector. Each object matches a line in the LLM specifications tab. - */ -public class LlmSpecification { - - String summariesLanguage = "en_US"; - boolean enableSummarize = false; - boolean enableCategorize = false; - boolean enableVectorEmbedding = false; - String llmEndpoint = ""; - int vectorDimension = 250; - String apiKey = ""; - String llm = ""; - String embeddingsModel = ""; - int maxTokens = 500; - String typeOfLlm = ""; - - public LlmSpecification() { - // Empty connector - } - - public LlmSpecification(Specification os, ConfigParams config) { - - for (int i = 0; i < os.getChildCount(); i++) { - final SpecificationNode sn = os.getChild(i); - - if (sn.getType().equals(LlmConfig.NODE_ENABLE_SUMMARIZE)) { - this.enableSummarize = "true".equals(sn.getAttributeValue(LlmConfig.ATTRIBUTE_VALUE)); - } else if (sn.getType().equals(LlmConfig.NODE_ENABLE_CATEGORIZE)) { - this.enableCategorize = "true".equals(sn.getAttributeValue(LlmConfig.ATTRIBUTE_VALUE)); - } else if (sn.getType().equals(LlmConfig.NODE_ENABLE_EMBEDDINGS)) { - this.enableVectorEmbedding = "true".equals(sn.getAttributeValue(LlmConfig.ATTRIBUTE_VALUE)); - } else if (sn.getType().equals(LlmConfig.NODE_MAXTOKENS)) { - this.maxTokens = Integer.parseInt(sn.getAttributeValue(LlmConfig.ATTRIBUTE_VALUE)); - } else if (sn.getType().equals(LlmConfig.NODE_SUMMARIES_LANGUAGE)) { - this.summariesLanguage = sn.getAttributeValue(LlmConfig.ATTRIBUTE_VALUE); - } - } - for (int i = 0; i < config.getChildCount(); i++) { - final ConfigNode cn = config.getChild(i); - - if (cn.getAttributeValue("name").equals(LlmConfig.NODE_ENDPOINT)) { - this.llmEndpoint = cn.getValue(); - } else if (cn.getAttributeValue("name").equals(LlmConfig.NODE_VECTOR_DIMENSION)) { - this.vectorDimension = Integer.parseInt(cn.getValue()); - } else if (cn.getAttributeValue("name").equals(LlmConfig.NODE_APIKEY)) { - this.apiKey = cn.getValue(); - } else if (cn.getAttributeValue("name").equals(LlmConfig.NODE_LLM)) { - this.llm = cn.getValue(); - } else if (cn.getAttributeValue("name").equals(LlmConfig.NODE_EMBEDDINGS_MODEL)) { - this.embeddingsModel = cn.getValue(); - } else if (cn.getAttributeValue("name").equals(LlmConfig.NODE_LLM_SERVICE)) { - this.typeOfLlm = cn.getValue(); - } - } - } - - public boolean getEnableSummarize() { - return enableSummarize; - } - - public void setEnableSummarize(Boolean enableSummarize) { - this.enableSummarize = enableSummarize; - } - - public boolean getEnableCategorize() { - return enableCategorize; - } - - public void setEnableCategorize(Boolean enableCategorize) { - this.enableCategorize = enableCategorize; - } - - public boolean getEnableVectorEmbedding() { - return enableVectorEmbedding; - } - - public void setEnableVectorEmbedding(Boolean enableVectorEmbedding) { - this.enableVectorEmbedding = enableVectorEmbedding; - } - - public String getLlmEndpoint() { - return llmEndpoint; - } - - public void setLlmEndpoint(String llmEndpoint) { - this.llmEndpoint = llmEndpoint; - } - - public int getVectorDimension() { - return vectorDimension; - } - - public void setVectorDimension(int vectorDimension) { - this.vectorDimension = vectorDimension; - } - - public String getApiKey() { - return apiKey; - } - - public void setApiKey(String apiKey) { - this.apiKey = apiKey; - } - - public String getEmbeddingsModel() { - return embeddingsModel; - } - - public void setEmbeddingsModel(String embeddingsModel) { - this.embeddingsModel = embeddingsModel; - } - - public String getSummariesLanguage() { - return summariesLanguage; - } - - public void setSummariesLanguage(String summariesLanguage) { - this.summariesLanguage = summariesLanguage; - } - - public String getLlm() { - return llm; - } - - public void setLlm(String llm) { - this.llm = llm; - } - - public String getTypeOfLlm() { - return typeOfLlm; - } - - public void setTypeOfLlm(String typeOfLlm) { - this.typeOfLlm = typeOfLlm; - } - - public int getMaxTokens() { - return maxTokens; - } - - public void setMaxTokens(int maxTokens) { - this.maxTokens = maxTokens; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - LlmSpecification that = (LlmSpecification) o; - return vectorDimension == that.vectorDimension && maxTokens == that.maxTokens && Objects.equals(summariesLanguage, that.summariesLanguage) && Objects.equals(enableSummarize, that.enableSummarize) && Objects.equals(enableCategorize, that.enableCategorize) && Objects.equals(enableVectorEmbedding, that.enableVectorEmbedding) && Objects.equals(llmEndpoint, that.llmEndpoint) && Objects.equals(apiKey, that.apiKey) && Objects.equals(llm, that.llm) && Objects.equals(embeddingsModel, that.embeddingsModel) && Objects.equals(typeOfLlm, that.typeOfLlm); - } - - @Override - public int hashCode() { - return Objects.hash(summariesLanguage, enableSummarize, enableCategorize, enableVectorEmbedding, llmEndpoint, vectorDimension, apiKey, llm, embeddingsModel, maxTokens, typeOfLlm); - } -} diff --git a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/utils/PromptUtils.java b/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/utils/PromptUtils.java deleted file mode 100755 index 58682fb3d..000000000 --- a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/utils/PromptUtils.java +++ /dev/null @@ -1,64 +0,0 @@ -/******************************************************************************* - * Copyright 2015 France Labs - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - *******************************************************************************/ -package com.francelabs.datafari.transformation.llm.utils; - - -import com.francelabs.datafari.transformation.llm.model.LlmSpecification; - -import java.util.Locale; - -/** - * Prompt Utility class for RAG - * - * @author France Labs - * - */ -public class PromptUtils { - - private PromptUtils() { - // Constructor - } - - /** - * Create a prompt for summarization - * - * @param content : The document content - * @return a prompt ready to be sent to the LLM service - */ - public static String promptForSummarization(String content, LlmSpecification spec) { - String prompt; - String language = ""; - if (!spec.getSummariesLanguage().isEmpty()) { - Locale loc = new Locale(spec.getSummariesLanguage()); - if (!loc.getDisplayLanguage(new Locale("en")).isEmpty()) language = " in " +loc.getDisplayLanguage(Locale.ENGLISH); - } - prompt = "\"\"\"Summarize this document " + language + ": \n\n" + content + "\"\"\""; - return prompt; - } - - /** - * Create a prompt for summarization - * - * @param content : The document content - * @return a prompt ready to be sent to the LLM service - */ - public static String promptForCategorization(String content) { - String prompt; - prompt = "\"\"\"Categorize the following document in one of the following categories: Invoice, Call for Tenders, Request for Quotations, Technical paper, Presentation, Resumes, Others. If you don't know, say \"Others\". \n\n" + content.substring(0, 30000) + "\"\"\""; - return prompt; - } - -} diff --git a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/utils/storage/DestinationStorage.java b/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/utils/storage/DestinationStorage.java deleted file mode 100644 index f01e4d4d6..000000000 --- a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/utils/storage/DestinationStorage.java +++ /dev/null @@ -1,57 +0,0 @@ -package com.francelabs.datafari.transformation.llm.utils.storage; - -import com.francelabs.datafari.transformation.llm.utils.storage.FileDestinationStorage; -import com.francelabs.datafari.transformation.llm.utils.storage.MemoryDestinationStorage; -import org.apache.manifoldcf.core.interfaces.ManifoldCFException; - -import java.io.InputStream; -import java.io.OutputStream; - -/** - * A suitable storage object (File, in memory...) to write to. - * - */ -public abstract class DestinationStorage { - /** We handle up to 64K in memory; after that we go to disk. */ - private static final long IN_MEMORY_MAXIMUM_FILE = 65536; - - - /** - * Get the output stream to write to. Caller should explicitly close this stream when done writing. - */ - public abstract OutputStream getOutputStream() throws ManifoldCFException; - - /** - * Get new binary length. - */ - public abstract long getBinaryLength() throws ManifoldCFException; - - /** - * Get the input stream to read from. Caller should explicitly close this stream when done reading. - */ - public abstract InputStream getInputStream() throws ManifoldCFException; - - /** - * Close the object and clean up everything. This should be called when the data is no longer needed. - */ - public abstract void close() throws ManifoldCFException; - - /** - * @param binaryLength content length - * @param classUsing class using this storage. - * - * @return the created storage object - * - * @throws ManifoldCFException - */ - public static DestinationStorage getDestinationStorage(long binaryLength, Class classUsing) throws ManifoldCFException { - DestinationStorage ds; - if (binaryLength <= IN_MEMORY_MAXIMUM_FILE) { - ds = new MemoryDestinationStorage((int)binaryLength); - } else { - ds = new FileDestinationStorage(classUsing); - } - return ds; - } - -} diff --git a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/utils/storage/FileDestinationStorage.java b/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/utils/storage/FileDestinationStorage.java deleted file mode 100644 index c874056c9..000000000 --- a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/utils/storage/FileDestinationStorage.java +++ /dev/null @@ -1,72 +0,0 @@ -package com.francelabs.datafari.transformation.llm.utils.storage; - -import org.apache.manifoldcf.core.interfaces.ManifoldCFException; - -import java.io.*; - -/** - * A temporary file to store data. - * - */ -public class FileDestinationStorage extends DestinationStorage { - protected final File outputFile; - protected final OutputStream outputStream; - - - public FileDestinationStorage(Class classUsing) throws ManifoldCFException { - File outputFile; - OutputStream outputStream; - - String prefix; - if (classUsing != null) { - prefix = classUsing.getSimpleName(); - } else { - prefix = FileDestinationStorage.class.getSimpleName(); - } - - try { - outputFile = File.createTempFile(prefix, "tmp"); - outputStream = new FileOutputStream(outputFile); - } catch (final IOException e) { - handleIOException(e); - outputFile = null; - outputStream = null; - } - this.outputFile = outputFile; - this.outputStream = outputStream; - } - - @Override - public OutputStream getOutputStream() throws ManifoldCFException { - return outputStream; - } - - @Override - public long getBinaryLength() throws ManifoldCFException { - return outputFile.length(); - } - - @Override - public InputStream getInputStream() throws ManifoldCFException { - try { - return new FileInputStream(outputFile); - } catch (final IOException e) { - handleIOException(e); - return null; - } - } - - @Override - public void close() throws ManifoldCFException { - outputFile.delete(); - } - - private int handleIOException(final IOException e) throws ManifoldCFException { - // IOException reading from our local storage... - if (e instanceof InterruptedIOException) { - throw new ManifoldCFException(e.getMessage(), e, ManifoldCFException.INTERRUPTED); - } - throw new ManifoldCFException(e.getMessage(), e); - } - -} diff --git a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/utils/storage/MemoryDestinationStorage.java b/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/utils/storage/MemoryDestinationStorage.java deleted file mode 100644 index d643036bb..000000000 --- a/datafari-llm-connector/src/main/java/com/francelabs/datafari/transformation/llm/utils/storage/MemoryDestinationStorage.java +++ /dev/null @@ -1,43 +0,0 @@ -package com.francelabs.datafari.transformation.llm.utils.storage; - -import com.francelabs.datafari.transformation.llm.utils.storage.DestinationStorage; -import org.apache.manifoldcf.core.interfaces.ManifoldCFException; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.InputStream; -import java.io.OutputStream; - -/** - * An in-memory stream - * - */ -public class MemoryDestinationStorage extends DestinationStorage { - private final ByteArrayOutputStream outputStream; - - - public MemoryDestinationStorage(final int sizeHint) { - outputStream = new ByteArrayOutputStream(sizeHint); - } - - - @Override - public OutputStream getOutputStream() throws ManifoldCFException { - return outputStream; - } - - @Override - public long getBinaryLength() throws ManifoldCFException { - return outputStream.size(); - } - - @Override - public InputStream getInputStream() throws ManifoldCFException { - return new ByteArrayInputStream(outputStream.toByteArray()); - } - - @Override - public void close() throws ManifoldCFException { - } - -} diff --git a/datafari-llm-connector/src/main/native2ascii/com/francelabs/datafari/transformation/llm/common_en_US.properties b/datafari-llm-connector/src/main/native2ascii/com/francelabs/datafari/transformation/llm/common_en_US.properties deleted file mode 100644 index 13afb69d8..000000000 --- a/datafari-llm-connector/src/main/native2ascii/com/francelabs/datafari/transformation/llm/common_en_US.properties +++ /dev/null @@ -1,30 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#LLM TRANSO CONNECTOR - CONFIGURATION -llm.TabName=LLM Transformation Connector -llm.typeOfLlm=Type of LLM -llm.option.datafariws=Datafari External Webservices -llm.option.openai=OpenAI API or similar -llm.endPointToUse=LLM API endpoint -llm.llmToUse=Model you want to use for cathegorization or summarization (leave empty to use the default model) -llm.embeddingsModelToUse=Model you want to use for vector embedding (leave empty to use the default model) -llm.apiKey=API key (if required) -llm.EnableSummarize=Enable LLM summarization -llm.EnableCategorize=Enable LLM categorization -llm.EnableEmbeddings=Enable vector embeddings -llm.MaxTokens=Max tokens for summaries -llm.SummariesLanguage=Summaries language -llm.vectorDimensions=Vector dimensions \ No newline at end of file diff --git a/datafari-llm-connector/src/main/native2ascii/com/francelabs/datafari/transformation/llm/common_es_ES.properties b/datafari-llm-connector/src/main/native2ascii/com/francelabs/datafari/transformation/llm/common_es_ES.properties deleted file mode 100644 index 26f4b110d..000000000 --- a/datafari-llm-connector/src/main/native2ascii/com/francelabs/datafari/transformation/llm/common_es_ES.properties +++ /dev/null @@ -1,30 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#LLM TRANSO CONNECTOR - CONFIGURATION -llm.TabName=LLM Transformation Connector -llm.typeOfLlm=Type of LLM -llm.option.datafariws=Datafari External Webservices -llm.option.openai=OpenAI API or similar -llm.endPointToUse=LLM API endpoint -llm.llmToUse=Model you want to use for cathegorization or summarization (leave empty to use the default model) -llm.embeddingsModelToUse=Model you want to use for vector embedding (leave empty to use the default model) -llm.apiKey=API key (if required) -llm.EnableSummarize=Enable LLM summarization -llm.EnableCategorize=Enable LLM categorization -llm.EnableEmbeddings=Enable vector embeddings -llm.MaxTokens=Max tokens for summaries -llm.SummariesLanguage=Summaries language -llm.vectorDimensions=Vector dimensions diff --git a/datafari-llm-connector/src/main/native2ascii/com/francelabs/datafari/transformation/llm/common_fr_FR.properties b/datafari-llm-connector/src/main/native2ascii/com/francelabs/datafari/transformation/llm/common_fr_FR.properties deleted file mode 100644 index 26f4b110d..000000000 --- a/datafari-llm-connector/src/main/native2ascii/com/francelabs/datafari/transformation/llm/common_fr_FR.properties +++ /dev/null @@ -1,30 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#LLM TRANSO CONNECTOR - CONFIGURATION -llm.TabName=LLM Transformation Connector -llm.typeOfLlm=Type of LLM -llm.option.datafariws=Datafari External Webservices -llm.option.openai=OpenAI API or similar -llm.endPointToUse=LLM API endpoint -llm.llmToUse=Model you want to use for cathegorization or summarization (leave empty to use the default model) -llm.embeddingsModelToUse=Model you want to use for vector embedding (leave empty to use the default model) -llm.apiKey=API key (if required) -llm.EnableSummarize=Enable LLM summarization -llm.EnableCategorize=Enable LLM categorization -llm.EnableEmbeddings=Enable vector embeddings -llm.MaxTokens=Max tokens for summaries -llm.SummariesLanguage=Summaries language -llm.vectorDimensions=Vector dimensions diff --git a/datafari-llm-connector/src/main/native2ascii/com/francelabs/datafari/transformation/llm/common_ja_JP.properties b/datafari-llm-connector/src/main/native2ascii/com/francelabs/datafari/transformation/llm/common_ja_JP.properties deleted file mode 100644 index 26f4b110d..000000000 --- a/datafari-llm-connector/src/main/native2ascii/com/francelabs/datafari/transformation/llm/common_ja_JP.properties +++ /dev/null @@ -1,30 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#LLM TRANSO CONNECTOR - CONFIGURATION -llm.TabName=LLM Transformation Connector -llm.typeOfLlm=Type of LLM -llm.option.datafariws=Datafari External Webservices -llm.option.openai=OpenAI API or similar -llm.endPointToUse=LLM API endpoint -llm.llmToUse=Model you want to use for cathegorization or summarization (leave empty to use the default model) -llm.embeddingsModelToUse=Model you want to use for vector embedding (leave empty to use the default model) -llm.apiKey=API key (if required) -llm.EnableSummarize=Enable LLM summarization -llm.EnableCategorize=Enable LLM categorization -llm.EnableEmbeddings=Enable vector embeddings -llm.MaxTokens=Max tokens for summaries -llm.SummariesLanguage=Summaries language -llm.vectorDimensions=Vector dimensions diff --git a/datafari-llm-connector/src/main/native2ascii/com/francelabs/datafari/transformation/llm/common_zh_CN.properties b/datafari-llm-connector/src/main/native2ascii/com/francelabs/datafari/transformation/llm/common_zh_CN.properties deleted file mode 100644 index 26f4b110d..000000000 --- a/datafari-llm-connector/src/main/native2ascii/com/francelabs/datafari/transformation/llm/common_zh_CN.properties +++ /dev/null @@ -1,30 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#LLM TRANSO CONNECTOR - CONFIGURATION -llm.TabName=LLM Transformation Connector -llm.typeOfLlm=Type of LLM -llm.option.datafariws=Datafari External Webservices -llm.option.openai=OpenAI API or similar -llm.endPointToUse=LLM API endpoint -llm.llmToUse=Model you want to use for cathegorization or summarization (leave empty to use the default model) -llm.embeddingsModelToUse=Model you want to use for vector embedding (leave empty to use the default model) -llm.apiKey=API key (if required) -llm.EnableSummarize=Enable LLM summarization -llm.EnableCategorize=Enable LLM categorization -llm.EnableEmbeddings=Enable vector embeddings -llm.MaxTokens=Max tokens for summaries -llm.SummariesLanguage=Summaries language -llm.vectorDimensions=Vector dimensions diff --git a/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/editConfiguration.js b/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/editConfiguration.js deleted file mode 100644 index bddf5ec05..000000000 --- a/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/editConfiguration.js +++ /dev/null @@ -1,27 +0,0 @@ - - - diff --git a/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/editConfiguration_llm.html b/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/editConfiguration_llm.html deleted file mode 100644 index 54c46947b..000000000 --- a/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/editConfiguration_llm.html +++ /dev/null @@ -1,88 +0,0 @@ - - -#if($TabName == $ResourceBundle.getString('llm.TabName')) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

$Encoder.bodyEscape($ResourceBundle.getString('llm.typeOfLlm')) - - -
- $Encoder.bodyEscape($ResourceBundle.getString('llm.endPointToUse')) - - -
- $Encoder.bodyEscape($ResourceBundle.getString('llm.llmToUse')) - - -
- $Encoder.bodyEscape($ResourceBundle.getString('llm.embeddingsModelToUse')) - - -
- $Encoder.bodyEscape($ResourceBundle.getString('llm.apiKey')) - - -
- $Encoder.bodyEscape($ResourceBundle.getString('llm.vectorDimensions')) - - -
-#else - - - - - - - - -#end \ No newline at end of file diff --git a/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/editSpecification.js b/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/editSpecification.js deleted file mode 100644 index eee71137d..000000000 --- a/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/editSpecification.js +++ /dev/null @@ -1,26 +0,0 @@ - - - diff --git a/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/editSpecification_llm.html b/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/editSpecification_llm.html deleted file mode 100644 index c11c97e88..000000000 --- a/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/editSpecification_llm.html +++ /dev/null @@ -1,76 +0,0 @@ - - -#if($TABNAME == $ResourceBundle.getString('llm.TabName') && ${SEQNUM} == ${SELECTEDNUM}) - - - - - - - - - - - - - - - - - - - - - - - - - - - -

$Encoder.bodyEscape($ResourceBundle.getString('llm.EnableSummarize')) -
$Encoder.bodyEscape($ResourceBundle.getString('llm.EnableCategorize')) -
$Encoder.bodyEscape($ResourceBundle.getString('llm.EnableEmbeddings')) -
$Encoder.bodyEscape($ResourceBundle.getString('llm.MaxTokens')) -
$Encoder.bodyEscape($ResourceBundle.getString('llm.SummariesLanguage')) - -
- -#else - - - - - - -#end \ No newline at end of file diff --git a/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/viewConfiguration.html b/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/viewConfiguration.html deleted file mode 100644 index c24a3a66f..000000000 --- a/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/viewConfiguration.html +++ /dev/null @@ -1,44 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
$Encoder.bodyEscape($ResourceBundle.getString('llm.endPointToUse'))$Encoder.bodyEscape($ENDPOINT)
$Encoder.bodyEscape($ResourceBundle.getString('llm.typeOfLlm'))$Encoder.bodyEscape($LLMTYPE)
$Encoder.bodyEscape($ResourceBundle.getString('llm.embeddingsModelToUse'))$Encoder.bodyEscape($EMBEDDINGMODEL)
$Encoder.bodyEscape($ResourceBundle.getString('llm.llmToUse'))$Encoder.bodyEscape($LLM)
$Encoder.bodyEscape($ResourceBundle.getString('llm.apiKey'))$Encoder.bodyEscape($APIKEY)
$Encoder.bodyEscape($ResourceBundle.getString('llm.vectorDimensions'))$Encoder.bodyEscape($DIMENSIONS)
diff --git a/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/viewSpecification.html b/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/viewSpecification.html deleted file mode 100644 index 1864c47b5..000000000 --- a/datafari-llm-connector/src/main/resources/com/francelabs/datafari/transformation/llm/viewSpecification.html +++ /dev/null @@ -1,54 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - -

$Encoder.bodyEscape($ResourceBundle.getString('llm.EnableSummarize')) - #if($ENABLESUMMARIZE) - - #end -
$Encoder.bodyEscape($ResourceBundle.getString('llm.EnableCategorize')) - #if($ENABLECATEGORIZE) - - #end -
$Encoder.bodyEscape($ResourceBundle.getString('llm.EnableEmbeddings')) - #if($ENABLEEMBEDDINGS) - - #end -

$Encoder.bodyEscape($ResourceBundle.getString('llm.MaxTokens'))$Encoder.bodyEscape($MAXTOKENS)
$Encoder.bodyEscape($ResourceBundle.getString('llm.SummariesLanguage'))$Encoder.bodyEscape($SUMMARIESLANGUAGE)
diff --git a/datafari-llm-connector/src/test/java/com/francelabs/datafari/transformation/llm/connectors/OpenAiLlmServiceTest.java b/datafari-llm-connector/src/test/java/com/francelabs/datafari/transformation/llm/connectors/OpenAiLlmServiceTest.java deleted file mode 100644 index 416beb054..000000000 --- a/datafari-llm-connector/src/test/java/com/francelabs/datafari/transformation/llm/connectors/OpenAiLlmServiceTest.java +++ /dev/null @@ -1,38 +0,0 @@ -package com.francelabs.datafari.transformation.llm.connectors; - -import com.francelabs.datafari.transformation.llm.model.LlmSpecification; -import org.junit.jupiter.api.Assertions; - - -import static org.junit.jupiter.api.Assertions.*; -class OpenAiLlmServiceTest { - - @org.junit.jupiter.api.Test - void embeddings() { - - LlmSpecification spec = new LlmSpecification(); - spec.setEnableVectorEmbedding(true); - String apikey = "sk-xxxxxxxxxxxxxxxxxxxxxxx"; - spec.setApiKey(apikey); - spec.setLlmEndpoint("https://api.openai.com/v1/"); - spec.setEmbeddingsModel("text-embedding-3-small"); - spec.setVectorDimension(124); - LlmService service = new OpenAiLlmService(spec); - - try { - float[] vector = service.embeddings("« Canard » est un terme générique qui désigne des oiseaux aquatiques ansériformes, au cou court, au large bec jaune aplati, aux très courtes pattes palmées et aux longues ailes pointues, domestiqués ou non"); - System.out.println(vector); - Assertions.assertTrue(vector.length > 100); - } catch (Exception e) { - System.out.println("Error in OpenAiLlmServiceTest.embeddings()" + e.getLocalizedMessage()); - } - } - - @org.junit.jupiter.api.Test - void summarize() { - } - - @org.junit.jupiter.api.Test - void categorize() { - } -} \ No newline at end of file diff --git a/datafari-mcf-connectors-dependencies/pom.xml b/datafari-mcf-connectors-dependencies/pom.xml index 2b5464688..1f0817c92 100644 --- a/datafari-mcf-connectors-dependencies/pom.xml +++ b/datafari-mcf-connectors-dependencies/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-mcf-connectors-dependencies @@ -20,7 +20,7 @@ com.francelabs.datafari datafari-core - 6.1-dev-Community + 6.2-dev-Community org.apache.manifoldcf diff --git a/datafari-mcf-scripts/pom.xml b/datafari-mcf-scripts/pom.xml index 952418df6..6588f72d7 100644 --- a/datafari-mcf-scripts/pom.xml +++ b/datafari-mcf-scripts/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-mcf-scripts @@ -14,7 +14,7 @@ com.francelabs.datafari datafari-core - 6.1-dev-Community + 6.2-dev-Community provided pom diff --git a/datafari-mcf/pom.xml b/datafari-mcf/pom.xml index 4c9a23399..7e11dc30f 100755 --- a/datafari-mcf/pom.xml +++ b/datafari-mcf/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-mcf diff --git a/datafari-metadatacleaner-connector/pom.xml b/datafari-metadatacleaner-connector/pom.xml index aa8c0664d..112a4243e 100755 --- a/datafari-metadatacleaner-connector/pom.xml +++ b/datafari-metadatacleaner-connector/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-metadatacleaner-connector @@ -14,7 +14,7 @@ com.francelabs.datafari datafari-mcf-connectors-dependencies - 6.1-dev-Community + 6.2-dev-Community pom provided diff --git a/datafari-regex-entity-connector/pom.xml b/datafari-regex-entity-connector/pom.xml index 6423bec5e..b5a4483ef 100644 --- a/datafari-regex-entity-connector/pom.xml +++ b/datafari-regex-entity-connector/pom.xml @@ -3,7 +3,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community datafari-regex-entity-connector Datafari Enterprise Search - Regex Entity connector module @@ -13,7 +13,7 @@ com.francelabs.datafari datafari-mcf-connectors-dependencies - 6.1-dev-Community + 6.2-dev-Community pom provided diff --git a/datafari-share-connector/pom.xml b/datafari-share-connector/pom.xml index 7fc0c00d2..2f3e7df3a 100755 --- a/datafari-share-connector/pom.xml +++ b/datafari-share-connector/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-share-connector @@ -14,7 +14,7 @@ com.francelabs.datafari datafari-mcf-connectors-dependencies - 6.1-dev-Community + 6.2-dev-Community pom provided diff --git a/datafari-solr-atomic-update/pom.xml b/datafari-solr-atomic-update/pom.xml index 78601f7ea..e82bd843a 100644 --- a/datafari-solr-atomic-update/pom.xml +++ b/datafari-solr-atomic-update/pom.xml @@ -6,7 +6,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community datafari-solr-atomic-update diff --git a/datafari-solr-connector/pom.xml b/datafari-solr-connector/pom.xml index d12b5c2a4..061c5e7e9 100644 --- a/datafari-solr-connector/pom.xml +++ b/datafari-solr-connector/pom.xml @@ -6,7 +6,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community datafari-solr-connector @@ -17,7 +17,7 @@ com.francelabs.datafari datafari-mcf-connectors-dependencies - 6.1-dev-Community + 6.2-dev-Community pom provided diff --git a/datafari-solr/pom.xml b/datafari-solr/pom.xml index ee6fdff05..6920668f8 100644 --- a/datafari-solr/pom.xml +++ b/datafari-solr/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-solr diff --git a/datafari-spacy-fastapi-connector/pom.xml b/datafari-spacy-fastapi-connector/pom.xml index f0d436798..aa2a0cb42 100755 --- a/datafari-spacy-fastapi-connector/pom.xml +++ b/datafari-spacy-fastapi-connector/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-spacy-fastapi-connector @@ -14,7 +14,7 @@ com.francelabs.datafari datafari-mcf-connectors-dependencies - 6.1-dev-Community + 6.2-dev-Community pom provided diff --git a/datafari-tika-server/pom.xml b/datafari-tika-server/pom.xml index ea6ab1ac3..f52c3a486 100644 --- a/datafari-tika-server/pom.xml +++ b/datafari-tika-server/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-tika-server diff --git a/datafari-tomcat/pom.xml b/datafari-tomcat/pom.xml index ff94b2735..2abfbb974 100644 --- a/datafari-tomcat/pom.xml +++ b/datafari-tomcat/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-tomcat diff --git a/datafari-ui/pom.xml b/datafari-ui/pom.xml index d4efd8d50..2f9d5cba5 100644 --- a/datafari-ui/pom.xml +++ b/datafari-ui/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-ui diff --git a/datafari-updateprocessor/pom.xml b/datafari-updateprocessor/pom.xml index 530358ef7..f9d1f9545 100644 --- a/datafari-updateprocessor/pom.xml +++ b/datafari-updateprocessor/pom.xml @@ -2,7 +2,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-updateprocessor @@ -44,7 +44,7 @@ com.francelabs.datafari datafari-jena - 6.1-dev-Community + 6.2-dev-Community pom provided diff --git a/datafari-web-connector/pom.xml b/datafari-web-connector/pom.xml index 0ec67528c..53507c08f 100755 --- a/datafari-web-connector/pom.xml +++ b/datafari-web-connector/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-web-connector @@ -14,7 +14,7 @@ com.francelabs.datafari datafari-mcf-connectors-dependencies - 6.1-dev-Community + 6.2-dev-Community pom provided diff --git a/datafari-webapp/pom.xml b/datafari-webapp/pom.xml index de8c92a15..b2288704d 100644 --- a/datafari-webapp/pom.xml +++ b/datafari-webapp/pom.xml @@ -2,7 +2,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-webapp @@ -12,7 +12,7 @@ com.francelabs.datafari datafari-core - 6.1-dev-Community + 6.2-dev-Community @@ -63,7 +63,7 @@ com.francelabs.datafari datafari-mcf-scripts - 6.1-dev-Community + 6.2-dev-Community diff --git a/datafari-zookeeper/pom.xml b/datafari-zookeeper/pom.xml index 07d81e8f1..ef14c25bb 100644 --- a/datafari-zookeeper/pom.xml +++ b/datafari-zookeeper/pom.xml @@ -4,7 +4,7 @@ com.francelabs.datafari datafari-ce - 6.1-dev-Community + 6.2-dev-Community 4.0.0 datafari-zookeeper diff --git a/linux/build.xml b/linux/build.xml index bb7709c3f..1278aee55 100644 --- a/linux/build.xml +++ b/linux/build.xml @@ -32,7 +32,6 @@ - @@ -442,12 +441,6 @@ - - - - - - diff --git a/linux/installer/DEBIAN/control b/linux/installer/DEBIAN/control index b9740ba2f..5c5ec7bb2 100755 --- a/linux/installer/DEBIAN/control +++ b/linux/installer/DEBIAN/control @@ -1,5 +1,5 @@ Package: datafari -Version: 6.1-dev +Version: 6.2-dev Section: base Priority: optional Architecture: all diff --git a/pom.xml b/pom.xml index e32836129..a46a62e8a 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ datafari-ce pom - 6.1-dev-Community + 6.2-dev-Community Datafari Enterprise Search CE - Parent POM 1.0 @@ -16,7 +16,7 @@ 4.1.3 393443a5b9849645362df2f7536e734e1fc6d513aadf440fab8dda3063553394a138805098796d35f9d4d31e2899ecab630a2ec2a44d00d5e63bed549a2e844c 4.17.0 - 6.1-dev-Community + 6.2-dev-Community 1.5 3eb37c4a9c013275b44db5ddaec10143 26.0-jre @@ -88,7 +88,6 @@ datafari-adminui datafari-git-plugin datafari-regex-entity-connector - datafari-llm-connector datafari-active-directory-connector datafari-solr-atomic-update datafari-confluence-v6-connector