Create new examples for OCR -> Extract Text workflow

Adds new examples to show how to pair the new OCR tool with the Extract Text tool.
datalogics · Jul 26, 2024 · cfc5b7c · cfc5b7c
1 parent bed956c
commit cfc5b7c
Show file tree

Hide file tree

Showing 6 changed files with 413 additions and 0 deletions.
diff --git a/DotNET/Complex Flow Examples/ocr-with-extract-text.cs b/DotNET/Complex Flow Examples/ocr-with-extract-text.cs
@@ -0,0 +1,62 @@
+using Newtonsoft.Json.Linq;
+using System;
+using System.IO;
+using System.Net.Http;
+using System.Text;
+using System.Threading.Tasks;
+
+/// <summary>
+/// Sample that runs a scanned PDF through the pdf-with-ocr-text route and then
+/// sends the resulting output ID to the extracted-text route to print the text.
+/// </summary>
+class Program
+{
+    private static readonly string apiKey = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"; // Your API key here
+
+    /// <summary>
+    /// Uploads the PDF for OCR, then requests text extraction for the OCR output.
+    /// Each raw JSON response is echoed to the console; if a call fails or the
+    /// expected field is missing, a message is written to stderr and the sample stops.
+    /// </summary>
+    /// <param name="args">Command-line arguments (unused).</param>
+    static async Task Main(string[] args)
+    {
+        using (var httpClient = new HttpClient { BaseAddress = new Uri("https://api.pdfrest.com") })
+        {
+            // Upload PDF for OCR
+            using var ocrRequest = new HttpRequestMessage(HttpMethod.Post, "pdf-with-ocr-text");
+
+            ocrRequest.Headers.TryAddWithoutValidation("Api-Key", apiKey);
+            ocrRequest.Headers.Accept.Add(new System.Net.Http.Headers.MediaTypeWithQualityHeaderValue("application/json"));
+
+            var pdfByteArray = File.ReadAllBytes("/path/to/file.pdf");
+            var pdfByteArrayContent = new ByteArrayContent(pdfByteArray);
+            pdfByteArrayContent.Headers.TryAddWithoutValidation("Content-Type", "application/pdf");
+
+            var ocrMultipartContent = new MultipartFormDataContent();
+            ocrMultipartContent.Add(pdfByteArrayContent, "file", "file.pdf");
+            ocrMultipartContent.Add(new StringContent("example_pdf-with-ocr-text_out"), "output");
+
+            ocrRequest.Content = ocrMultipartContent;
+            using var ocrResponse = await httpClient.SendAsync(ocrRequest);
+
+            var ocrResult = await ocrResponse.Content.ReadAsStringAsync();
+            Console.WriteLine("OCR response received.");
+            Console.WriteLine(ocrResult);
+
+            // Stop before parsing: an error body may not be JSON and will not
+            // carry an output ID to chain into the next call.
+            if (!ocrResponse.IsSuccessStatusCode)
+            {
+                Console.Error.WriteLine($"OCR request failed with status code {(int)ocrResponse.StatusCode}.");
+                return;
+            }
+
+            JObject ocrResponseData = JObject.Parse(ocrResult);
+            string ocrPDFID = ocrResponseData.Value<string>("outputId");
+            if (string.IsNullOrEmpty(ocrPDFID))
+            {
+                Console.Error.WriteLine("OCR response did not contain an outputId.");
+                return;
+            }
+
+            // Extract text from OCR'd PDF
+            using var extractTextRequest = new HttpRequestMessage(HttpMethod.Post, "extracted-text");
+
+            extractTextRequest.Headers.TryAddWithoutValidation("Api-Key", apiKey);
+            extractTextRequest.Headers.Accept.Add(new System.Net.Http.Headers.MediaTypeWithQualityHeaderValue("application/json"));
+
+            var extractTextMultipartContent = new MultipartFormDataContent();
+            extractTextMultipartContent.Add(new StringContent(ocrPDFID), "id");
+
+            extractTextRequest.Content = extractTextMultipartContent;
+            using var extractTextResponse = await httpClient.SendAsync(extractTextRequest);
+
+            var extractTextResult = await extractTextResponse.Content.ReadAsStringAsync();
+            Console.WriteLine("Extract text response received.");
+            Console.WriteLine(extractTextResult);
+
+            if (!extractTextResponse.IsSuccessStatusCode)
+            {
+                Console.Error.WriteLine($"Extract text request failed with status code {(int)extractTextResponse.StatusCode}.");
+                return;
+            }
+
+            JObject extractTextResponseData = JObject.Parse(extractTextResult);
+            string fullText = extractTextResponseData.Value<string>("fullText");
+
+            Console.WriteLine("Extracted text:");
+            Console.WriteLine(fullText);
+        }
+    }
+}
diff --git a/Java/Complex Flow Examples/OcrWithExtractText.java b/Java/Complex Flow Examples/OcrWithExtractText.java
@@ -0,0 +1,106 @@
+import io.github.cdimascio.dotenv.Dotenv;
+import java.io.File;
+import java.io.IOException;
+import java.util.concurrent.TimeUnit;
+import okhttp3.*;
+import org.json.JSONObject;
+
+/* In this sample, we will show how to convert a scanned document into a PDF with
+ * searchable and extractable text using Optical Character Recognition (OCR), and then
+ * extract that text from the newly created document.
+ *
+ * First, we will upload a scanned PDF to the /pdf-with-ocr-text route and capture the
+ * output ID. Then, we will send the output ID to the /extracted-text route, which will
+ * return the newly added text.
+ */
+
+public class OcrWithExtractText {
+
+    // Specify the path to your PDF file here, or as the first argument when running the program.
+    private static final String DEFAULT_PDF_FILE_PATH = "/path/to/file.pdf";
+
+    // Specify your API key here, or in the environment variable PDFREST_API_KEY.
+    // You can also put the environment variable in a .env file.
+    private static final String DEFAULT_API_KEY = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx";
+
+    public static void main(String[] args) {
+        File pdfFile;
+        if (args.length > 0) {
+            pdfFile = new File(args[0]);
+        } else {
+            pdfFile = new File(DEFAULT_PDF_FILE_PATH);
+        }
+
+        final Dotenv dotenv = Dotenv.configure().ignoreIfMalformed().ignoreIfMissing().load();
+
+        final RequestBody pdfFileRequestBody =
+                RequestBody.create(pdfFile, MediaType.parse("application/pdf"));
+        RequestBody ocrRequestBody =
+                new MultipartBody.Builder()
+                        .setType(MultipartBody.FORM)
+                        .addFormDataPart("file", pdfFile.getName(), pdfFileRequestBody)
+                        .addFormDataPart("output", "example_pdf-with-ocr-text_out")
+                        .build();
+        Request ocrRequest =
+                new Request.Builder()
+                        .header("Api-Key", dotenv.get("PDFREST_API_KEY", DEFAULT_API_KEY))
+                        .url("https://api.pdfrest.com/pdf-with-ocr-text")
+                        .post(ocrRequestBody)
+                        .build();
+        try {
+            OkHttpClient ocrClient =
+                    new OkHttpClient().newBuilder().readTimeout(60, TimeUnit.SECONDS).build();
+
+            Response ocrResponse = ocrClient.newCall(ocrRequest).execute();
+
+            System.out.println("Response status code: " + ocrResponse.code());
+            if (ocrResponse.body() != null) {
+                String ocrResponseString = ocrResponse.body().string();
+
+                JSONObject ocrJSON = new JSONObject(ocrResponseString);
+                if (ocrJSON.has("error")) {
+                    System.out.println("Error during OCR call: " + ocrResponseString);
+                    return;
+                }
+
+                String ocrPDFID = ocrJSON.get("outputId").toString();
+                System.out.println("Got the output ID: " + ocrPDFID);
+
+                RequestBody extractRequestBody =
+                        new MultipartBody.Builder()
+                                .setType(MultipartBody.FORM)
+                                .addFormDataPart("id", ocrPDFID)
+                                .build();
+                Request extractRequest =
+                        new Request.Builder()
+                                .header("Api-Key", dotenv.get("PDFREST_API_KEY", DEFAULT_API_KEY))
+                                .url("https://api.pdfrest.com/extracted-text")
+                                .post(extractRequestBody)
+                                .build();
+                try {
+                    OkHttpClient extractClient =
+                            new OkHttpClient().newBuilder().readTimeout(60, TimeUnit.SECONDS).build();
+
+                    Response extractResponse = extractClient.newCall(extractRequest).execute();
+
+                    System.out.println("Response status code: " + extractResponse.code());
+                    if (extractResponse.body() != null) {
+                        String extractResponseString = extractResponse.body().string();
+
+                        JSONObject extractJSON = new JSONObject(extractResponseString);
+                        if (extractJSON.has("error")) {
+                            System.out.println("Error during text extraction call: " + extractResponseString);
+                            return;
+                        }
+
+                        System.out.println(extractJSON.getString("fullText"));
+                    }
+                } catch (IOException e) {
+                    throw new RuntimeException(e);
+                }
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+}
diff --git a/JavaScript/Complex Flow Examples/ocr-with-extract-text.js b/JavaScript/Complex Flow Examples/ocr-with-extract-text.js
@@ -0,0 +1,74 @@
+var axios = require("axios");
+var FormData = require("form-data");
+var fs = require("fs");
+
+/* In this sample, we will show how to convert a scanned document into a PDF with
+* searchable and extractable text using Optical Character Recognition (OCR), and then
+* extract that text from the newly created document.
+*
+* First, we will upload a scanned PDF to the /pdf-with-ocr-text route and capture the
+* output ID. Then, we will send the output ID to the /extracted-text route, which will
+* return the newly added text.
+*/
+
+var apiKey = "xxxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"; // Replace with your API key
+
+var ocrData = new FormData();
+ocrData.append("file", fs.createReadStream("/path/to/file.pdf"), "file_name.pdf");
+ocrData.append("output", "example_pdf-with-ocr-text_out");
+
+var ocrConfig = {
+  method: "post",
+  maxBodyLength: Infinity,
+  url: "https://api.pdfrest.com/pdf-with-ocr-text",
+  headers: {
+    "Api-Key": apiKey,
+    ...ocrData.getHeaders(),
+  },
+  data: ocrData,
+};
+
+console.log("Sending POST request to OCR endpoint...");
+axios(ocrConfig)
+  .then(function (response) {
+    console.log("Response status code: " + response.status);
+
+    if (response.status === 200) {
+      var ocrPDFID = response.data.outputId;
+      console.log("Got the output ID: " + ocrPDFID);
+
+      var extractData = new FormData();
+      extractData.append("id", ocrPDFID);
+
+      var extractConfig = {
+        method: "post",
+        maxBodyLength: Infinity,
+        url: "https://api.pdfrest.com/extracted-text",
+        headers: {
+          "Api-Key": apiKey,
+          ...extractData.getHeaders(),
+        },
+        data: extractData,
+      };
+
+      console.log("Sending POST request to extract text endpoint...");
+      axios(extractConfig)
+        .then(function (extractResponse) {
+          console.log("Response status code: " + extractResponse.status);
+
+          if (extractResponse.status === 200) {
+            console.log(extractResponse.data.fullText);
+          } else {
+            console.log(extractResponse.data);
+          }
+        })
+        .catch(function (error) {
+          console.log(error.response ? error.response.data : error.message);
+        });
+    } else {
+      console.log(response.data);
+    }
+  })
+  .catch(function (error) {
+    console.log(error.response ? error.response.data : error.message);
+  });
diff --git a/PHP/Complex Flow Examples/ocr-with-extract-text.php b/PHP/Complex Flow Examples/ocr-with-extract-text.php
@@ -0,0 +1,72 @@
+<?php
+
+require 'vendor/autoload.php';
+
+use GuzzleHttp\Client;
+use GuzzleHttp\Psr7\Request;
+use GuzzleHttp\Psr7\Utils;
+
+/* In this sample, we will show how to convert a scanned document into a PDF with
+ * searchable and extractable text using Optical Character Recognition (OCR), and then
+ * extract that text from the newly created document.
+ *
+ * First, we will upload a scanned PDF to the /pdf-with-ocr-text route and capture the
+ * output ID. Then, we will send the output ID to the /extracted-text route, which will
+ * return the newly added text.
+ */
+
+// Authentication header sent with both requests.
+$apiHeaders = [
+  'Api-Key' => 'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx' // Replace with your API key
+];
+
+$httpClient = new Client();
+
+// Step 1: upload the PDF for OCR.
+$pdfPart = [
+  'name' => 'file',
+  'contents' => Utils::tryFopen('/path/to/file.pdf', 'r'),
+  'filename' => 'file.pdf',
+  'headers' => [
+    'Content-Type' => 'application/pdf'
+  ]
+];
+$outputNamePart = [
+  'name' => 'output',
+  'contents' => 'example_pdf-with-ocr-text_out'
+];
+$ocrOptions = ['multipart' => [$pdfPart, $outputNamePart]];
+
+$ocrRequest = new Request('POST', 'https://api.pdfrest.com/pdf-with-ocr-text', $apiHeaders);
+
+echo "Sending POST request to OCR endpoint...\n";
+$ocrResponse = $httpClient->sendAsync($ocrRequest, $ocrOptions)->wait();
+
+echo "Response status code: {$ocrResponse->getStatusCode()}\n";
+
+// The output ID from the OCR response identifies the file for the next call.
+$ocrResult = json_decode($ocrResponse->getBody());
+$ocrPDFID = $ocrResult->outputId;
+echo "Got the output ID: {$ocrPDFID}\n";
+
+// Step 2: extract text from the OCR'd PDF, referenced by its ID.
+$idPart = [
+  'name' => 'id',
+  'contents' => $ocrPDFID
+];
+$extractOptions = ['multipart' => [$idPart]];
+
+$extractRequest = new Request('POST', 'https://api.pdfrest.com/extracted-text', $apiHeaders);
+
+echo "Sending POST request to extract text endpoint...\n";
+$extractResponse = $httpClient->sendAsync($extractRequest, $extractOptions)->wait();
+
+echo "Response status code: {$extractResponse->getStatusCode()}\n";
+
+$extractResult = json_decode($extractResponse->getBody());
+echo $extractResult->fullText . "\n";
+
+?>
diff --git a/Python/Complex Flow Examples/ocr-with-extract-text.py b/Python/Complex Flow Examples/ocr-with-extract-text.py
@@ -0,0 +1,67 @@
+from requests_toolbelt import MultipartEncoder
+import requests
+
+
+# In this sample, we will show how to convert a scanned document into a PDF with
+# searchable and extractable text using Optical Character Recognition (OCR), and then
+# extract that text from the newly created document.
+#
+# First, we will upload a scanned PDF to the /pdf-with-ocr-text route and capture the
+# output ID. Then, we will send the output ID to the /extracted-text route, which will
+# return the newly added text.
+
+api_key = 'xxxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx' # place your api key here
+
+ocr_endpoint_url = 'https://api.pdfrest.com/pdf-with-ocr-text'
+mp_encoder_pdf = MultipartEncoder(
+    fields={
+        'file': ('file_name.pdf', open('/path/to/file.pdf', 'rb'), 'application/pdf'),
+        'output': 'example_pdf-with-ocr-text_out',
+    }
+)
+
+image_headers = {
+    'Accept': 'application/json',
+    'Content-Type': mp_encoder_pdf.content_type,
+    'Api-Key': api_key
+}
+
+print("Sending POST request to OCR endpoint...")
+response = requests.post(ocr_endpoint_url, data=mp_encoder_pdf, headers=image_headers)
+
+print("Response status code: " + str(response.status_code))
+
+if response.ok:
+    response_json = response.json()
+    ocr_pdf_id = response_json["outputId"]
+    print("Got the output ID: " + ocr_pdf_id)
+
+    extract_endpoint_url = 'https://api.pdfrest.com/extracted-text'
+
+    mp_encoder_extract_text = MultipartEncoder(
+        fields={
+            'id': ocr_pdf_id
+        }
+    )
+
+    extract_text_headers = {
+        'Accept': 'application/json',
+        'Content-Type': mp_encoder_extract_text.content_type,
+        'Api-Key': api_key
+    }
+
+    print("Sending POST request to extract text endpoint...")
+    extract_response = requests.post(extract_endpoint_url, data=mp_encoder_extract_text, headers=extract_text_headers)
+
+    print("Response status code: " + str(extract_response.status_code))
+
+    if extract_response.ok:
+        extract_json = extract_response.json()
+        print(extract_json["fullText"])
+
+    else:
+        print(extract_response.text)
+
+
+else:
+    print(response.text)