diff --git a/build.sh b/build.sh index 7a23b1e..3a06065 100755 --- a/build.sh +++ b/build.sh @@ -5,6 +5,9 @@ if [ -z "$version" ]; then read -p "Enter new version number: " version fi - -rm -f zotero-ocr-${version}.xpi -zip -r zotero-ocr-${version}.xpi chrome/* defaults/* chrome.manifest install.rdf +if [ -d build ]; then + rm -f build/*.xpi +else + mkdir build +fi +cd src && zip -r ../build/zotero-ocr-${version}.xpi * -x "*.DS_Store" && cd .. diff --git a/chrome.manifest b/chrome.manifest deleted file mode 100644 index e930f02..0000000 --- a/chrome.manifest +++ /dev/null @@ -1,7 +0,0 @@ -content zoteroocr chrome/content/ - -overlay chrome://zotero/content/zoteroPane.xul chrome://zoteroocr/content/overlay.xul - -locale zoteroocr en-US chrome/locale/en-US/ - -skin zoteroocr default chrome/skin/default/zoteroocr/ diff --git a/chrome/content/overlay.xul b/chrome/content/overlay.xul deleted file mode 100644 index 528b12f..0000000 --- a/chrome/content/overlay.xul +++ /dev/null @@ -1,32 +0,0 @@ - - - - - - - \n\n'; - Zotero.File.putContents(htmlfile, pagecontent); - yield Zotero.Attachments.linkFromFile({ - file: OS.Path.join(dir, pagename), - contentType: "text/html", - parentItemID: item.id - }); - } - } - - // attach PDF if it is a new one - if (Zotero.Prefs.get("zoteroocr.outputPDF") && !(Zotero.Prefs.get("zoteroocr.overwritePDF"))) { - yield Zotero.Attachments.linkFromFile({ - file: ocrbase + '.pdf', - parentItemID: item.id - }); - } - - if (!Zotero.Prefs.get("zoteroocr.outputPNG") && imageListArray) { - // delete image list - yield Zotero.File.removeIfExists(imageList); - // delete PNGs - for (let imageName of imageListArray) { - yield Zotero.File.removeIfExists(imageName); - } - } - } - }); - -}; diff --git a/chrome/locale/en-US/zoteroocr.dtd b/chrome/locale/en-US/zoteroocr.dtd deleted file mode 100644 index 256f175..0000000 --- a/chrome/locale/en-US/zoteroocr.dtd +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git 
a/chrome/locale/en-US/zoteroocr.properties b/chrome/locale/en-US/zoteroocr.properties deleted file mode 100644 index e69de29..0000000 diff --git a/install.rdf b/install.rdf deleted file mode 100644 index 035a007..0000000 --- a/install.rdf +++ /dev/null @@ -1,34 +0,0 @@ - - - - chrome://zoteroocr/content/preferences.xul - 2 - - - zotero@chnm.gmu.edu - 5.0 - 6.* - - - - - juris-m@juris-m.github.io - 5.0 - 6.* - - - - diff --git a/screenshots/Zotero-OCR-Preferences.png b/screenshots/Zotero-OCR-Preferences.png index c9773c4..33e3f3a 100644 Binary files a/screenshots/Zotero-OCR-Preferences.png and b/screenshots/Zotero-OCR-Preferences.png differ diff --git a/src/bootstrap.js b/src/bootstrap.js new file mode 100644 index 0000000..a6fa87d --- /dev/null +++ b/src/bootstrap.js @@ -0,0 +1,41 @@ +var ZoteroOCR; + +function log(msg) { + Zotero.debug("Zotero OCR: " + msg); +} + +function install() { + log("Installed"); +} + +async function startup({ id, version, rootURI }) { + log("Starting"); + + Zotero.PreferencePanes.register({ + pluginID: 'zotero-ocr@bib.uni-mannheim.de', + src: rootURI + 'prefs.xhtml', + //scripts: [rootURI + 'prefs.js'] + }); + + Services.scriptloader.loadSubScript(rootURI + 'zotero-ocr.js'); + ZoteroOCR.init({ id, version, rootURI }); + ZoteroOCR.addToAllWindows(); +} + +function onMainWindowLoad({ window }) { + ZoteroOCR.addToWindow(window); +} + +function onMainWindowUnload({ window }) { + ZoteroOCR.removeFromWindow(window); +} + +function shutdown() { + log("Shutting down"); + ZoteroOCR.removeFromAllWindows(); + ZoteroOCR = undefined; +} + +function uninstall() { + log("Uninstalled"); +} diff --git a/src/locale/en-US/zotero-ocr.ftl b/src/locale/en-US/zotero-ocr.ftl new file mode 100644 index 0000000..95e2abf --- /dev/null +++ b/src/locale/en-US/zotero-ocr.ftl @@ -0,0 +1,5 @@ +ocr-selected-pdfs = + .label = OCR selected PDF(s) + +ocr-preferences = + .label = Zotero OCR Preferences diff --git a/src/manifest.json b/src/manifest.json new file mode 100644 
index 0000000..2e9fe27 --- /dev/null +++ b/src/manifest.json @@ -0,0 +1,19 @@ +{ + "manifest_version": 2, + "name": "Zotero OCR", + "version": "0.6.1", + "description": "Zotero Plugin for OCR", + "author": "Philipp Zumstein", + "icons": { + "32": "ubmannheim32x32.png", + "64": "ubmannheim64x64.png" + }, + "applications": { + "zotero": { + "id": "zotero-ocr@bib.uni-mannheim.de", + "update_url": "https://raw.githubusercontent.com/UB-Mannheim/zotero-ocr/master/updates.json", + "strict_min_version": "6.999", + "strict_max_version": "7.0.*" + } + } +} diff --git a/defaults/preferences/defaults.js b/src/prefs.js similarity index 88% rename from defaults/preferences/defaults.js rename to src/prefs.js index b8d17a2..7238aa6 100644 --- a/defaults/preferences/defaults.js +++ b/src/prefs.js @@ -1,4 +1,3 @@ -// don't set ocrPath, language by default pref("extensions.zotero.zoteroocr.outputNote", true); pref("extensions.zotero.zoteroocr.outputPDF", true); pref("extensions.zotero.zoteroocr.overwritePDF", false); diff --git a/src/prefs.xhtml b/src/prefs.xhtml new file mode 100644 index 0000000..152276d --- /dev/null +++ b/src/prefs.xhtml @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + diff --git a/chrome/skin/default/zoteroocr/ubmannheim.png b/src/ubmannheim.png similarity index 100% rename from chrome/skin/default/zoteroocr/ubmannheim.png rename to src/ubmannheim.png diff --git a/chrome/skin/default/zoteroocr/ubmannheim32x32.png b/src/ubmannheim32x32.png similarity index 100% rename from chrome/skin/default/zoteroocr/ubmannheim32x32.png rename to src/ubmannheim32x32.png diff --git a/chrome/skin/default/zoteroocr/ubmannheim64x64.png b/src/ubmannheim64x64.png similarity index 100% rename from chrome/skin/default/zoteroocr/ubmannheim64x64.png rename to src/ubmannheim64x64.png diff --git a/src/zotero-ocr.js b/src/zotero-ocr.js new file mode 100644 index 0000000..136e722 --- /dev/null +++ b/src/zotero-ocr.js @@ -0,0 +1,291 @@ +// zoteroocr.js + +// See 
https://developer.mozilla.org/en-US/docs/Mozilla/JavaScript_code_modules. +Components.utils.import("resource://gre/modules/FileUtils.jsm"); +Components.utils.import("resource://gre/modules/osfile.jsm"); + + +ZoteroOCR = { + id: null, + version: null, + rootURI: null, + initialized: false, + addedElementIDs: [], + + init({ id, version, rootURI }) { + if (this.initialized) return; + this.id = id; + this.version = version; + this.rootURI = rootURI; + this.initialized = true; + }, + + log(msg) { + Zotero.debug("ZoteroOCR: " + msg); + }, + + addToWindow(window) { + let doc = window.document; + + // Use Fluent for localization + window.MozXULElement.insertFTLIfNeeded("zotero-ocr.ftl"); + + // Add context menu option + let popupmenuitem = doc.createXULElement('menuitem'); + popupmenuitem.id = 'zotero-ocr-item-menu'; + popupmenuitem.className = 'menuitem-iconic zotero-menuitem-ocr'; + popupmenuitem.setAttribute('data-l10n-id', 'ocr-selected-pdfs'); + doc.getElementById('zotero-itemmenu').appendChild(popupmenuitem); + popupmenuitem.addEventListener('command', () => { + ZoteroOCR.recognize(window); + }); + this.storeAddedElement(popupmenuitem); + }, + + addToAllWindows() { + var windows = Zotero.getMainWindows(); + for (let win of windows) { + if (!win.ZoteroPane) continue; + this.addToWindow(win); + } + }, + + storeAddedElement(elem) { + if (!elem.id) { + throw new Error("Element must have an id"); + } + this.addedElementIDs.push(elem.id); + }, + + removeFromWindow(window) { + var doc = window.document; + // Remove all elements added to DOM + for (let id of this.addedElementIDs) { + doc.getElementById(id)?.remove(); + } + doc.querySelector('[href="zotero-ocr.ftl"]')?.remove(); + }, + + removeFromAllWindows() { + var windows = Zotero.getMainWindows(); + for (let win of windows) { + if (!win.ZoteroPane) continue; + this.removeFromWindow(win); + } + }, + + async recognize(window) { + + // Look for the tesseract executable in the settings and at commonly used locations. 
+ // If it is found, the settings are updated. + // Otherwise abort with an alert. + Zotero.debug("entering recognize()"); + let ocrEngine = Zotero.Prefs.get("zoteroocr.ocrPath"); + let found = false; + if (ocrEngine) { + let pathOrFile = FileUtils.File(ocrEngine); + // If a directory is given, then try for the standard name of the tool. + if (pathOrFile.isDirectory()) { + if (Zotero.isWin) { + ocrEngine = OS.Path.join(ocrEngine, "tesseract.exe"); + } + else { + ocrEngine = OS.Path.join(ocrEngine, "tesseract"); + } + Zotero.Prefs.set("zoteroocr.ocrPath", ocrEngine); + } + found = await OS.File.exists(ocrEngine); + } + else { + let path = ["", "/usr/local/bin/", "/usr/bin/", "C:\\Program Files\\Tesseract-OCR\\", "/opt/homebrew/bin/", "/usr/local/homebrew/bin/"]; + for (ocrEngine of path) { + ocrEngine += "tesseract"; + if (Zotero.isWin) { + ocrEngine += ".exe"; + } + let tesseractFound = await OS.File.exists(ocrEngine); + if (tesseractFound) { + found = true; + Zotero.debug("Found " + ocrEngine); + Zotero.Prefs.set("zoteroocr.ocrPath", ocrEngine); + break; + } + Zotero.debug("No " + ocrEngine); + } + } + if (!found) { + window.alert("Tesseract executable not found. Tried: " + ocrEngine); + return; + } + + // See https://developer.mozilla.org/en-US/docs/Archive/Add-ons/Code_snippets/File_I_O#Getting_special_files + // and https://dxr.mozilla.org/mozilla-central/source/xpcom/io/nsDirectoryServiceDefs.h. 
+ let zdir = FileUtils.getDir('GreBinD', []); + + // Look for a specific path in the preferences for pdftoppm + let pdftoppm = Zotero.Prefs.get("zoteroocr.pdftoppmPath"); + if (!pdftoppm) { + // look for pdftoppm in various possible directories + let path = ["", "/usr/local/bin/", "/usr/bin/", "/opt/homebrew/bin/", "/usr/local/homebrew/bin/", zdir.path + (Zotero.isWin ? "\\" : "/")]; + for (pdftoppm of path) { + pdftoppm += "pdftoppm"; + if (Zotero.isWin) { + pdftoppm += ".exe"; + } + let pdftoppmFound = await OS.File.exists(pdftoppm); + if (pdftoppmFound) { + found = true; + Zotero.debug("Found " + pdftoppm); + Zotero.Prefs.set("zoteroocr.pdftoppmPath", pdftoppm); + break; + } + Zotero.debug("No " + pdftoppm); + } + } + if (Zotero.isWin && !(pdftoppm.endsWith(".exe"))) { + pdftoppm = pdftoppm + ".exe"; + } + if (!(await OS.File.exists(pdftoppm))) { + window.alert("No " + pdftoppm + " executable found."); + return; + } + + let items = Zotero.getActiveZoteroPane().getSelectedItems(); + for (let item of items) { + // find the PDF + let pdfItem; + if (item.isAttachment()) { + if (item.isFileAttachment() && item.attachmentContentType == 'application/pdf') { + pdfItem = item; + item = Zotero.Items.get(item.parentItemID); + } + else { + window.alert("Item is attachment but not PDF and will be ignored."); + continue; + } + } + else { + let pdfAttachments = item.getAttachments(false) + .map(itemID => Zotero.Items.get(itemID)) + .filter(att => att.isFileAttachment() && att.attachmentContentType == 'application/pdf'); + if (pdfAttachments.length == 0) { + window.alert("No PDF found for the selected item."); + continue; + } + if (pdfAttachments.length > 1) { + window.alert("There are several PDFs attached to this item. 
Only the first one will be processed."); + } + pdfItem = pdfAttachments[0]; + } + let pdf = pdfItem.getFilePath(); + let base = pdf.replace(/\.pdf$/, ''); + let dir = OS.Path.dirname(pdf); + // let infofile = dir + '/pdfinfo.txt'; + let ocrbase = Zotero.Prefs.get("zoteroocr.overwritePDF") ? base : base + '.ocr'; + // TODO filter out PDFs which have already a text layer + + // extract images from PDF + let imageList = OS.Path.join(dir, 'image-list.txt'); + if (!(await OS.File.exists(imageList))) { + try { + Zotero.debug("Running " + pdftoppm + ' -png -r 300 ' + pdf + ' ' + dir + '/page'); + await Zotero.Utilities.Internal.exec(pdftoppm, ['-progress', '-png', '-r', 300, pdf, dir + '/page']); + } + catch (e) { + Zotero.logError(e); + } + + var iterator = new OS.File.DirectoryIterator(dir); + var imageListArray = []; + await iterator.forEach(function onEntry(entry) { + Zotero.debug(entry); + if (entry.name.match(/-\d+\.png$/)) { + imageListArray.push(entry.path); + } + }); + Zotero.debug('Files are now:') + Zotero.debug(imageListArray); + + // save the list of images in a separate file + Zotero.File.putContents(Zotero.File.pathToFile(imageList), imageListArray.join('\n')); + + } + + let parameters = [dir + '/image-list.txt']; + parameters.push(ocrbase); + if (Zotero.Prefs.get("zoteroocr.language")) { + parameters.push('-l'); + parameters.push(Zotero.Prefs.get("zoteroocr.language")); + } + parameters.push('txt'); + if (Zotero.Prefs.get("zoteroocr.outputPDF")) { + parameters.push('pdf'); + } + if (Zotero.Prefs.get("zoteroocr.outputHocr")) { + parameters.push('hocr'); + } + try { + Zotero.debug("Running " + ocrEngine + ' ' + parameters.join(' ')); + await Zotero.Utilities.Internal.exec(ocrEngine, parameters); + } + catch (e) { + Zotero.logError(e); + } + + if (Zotero.Prefs.get("zoteroocr.outputNote")) { + let contents = await Zotero.File.getContentsAsync(ocrbase + '.txt'); + contents = contents.replace(/(?:\r\n|\r|\n)/g, '
'); + let newNote = new Zotero.Item('note'); + newNote.setNote(contents); + newNote.parentID = item.id; + await newNote.saveTx(); + } + + if (Zotero.Prefs.get("zoteroocr.outputHocr")) { + let contents = await Zotero.File.getContentsAsync(ocrbase + '.hocr'); + // replace the absolute paths of images with relative ones + let escapedDir = dir.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + let regexp = new RegExp(escapedDir + "/", 'g'); + contents = contents.replace(regexp, ''); + // split content into the preamble and pages + contents = contents.replace("\n", ''); + let parts = contents.split("
\n\n'; + Zotero.File.putContents(htmlfile, pagecontent); + await Zotero.Attachments.linkFromFile({ + file: OS.Path.join(dir, pagename), + contentType: "text/html", + parentItemID: item.id + }); + } + } + + // attach PDF if it is a new one + if (Zotero.Prefs.get("zoteroocr.outputPDF") && !(Zotero.Prefs.get("zoteroocr.overwritePDF"))) { + await Zotero.Attachments.linkFromFile({ + file: ocrbase + '.pdf', + parentItemID: item.id + }); + } + + if (!Zotero.Prefs.get("zoteroocr.outputPNG") && imageListArray) { + // delete image list + await Zotero.File.removeIfExists(imageList); + // delete PNGs + for (let imageName of imageListArray) { + await Zotero.File.removeIfExists(imageName); + } + } + } + } +}; diff --git a/update.rdf b/update.rdf deleted file mode 100644 index 47b98ba..0000000 --- a/update.rdf +++ /dev/null @@ -1,30 +0,0 @@ - - - - - - - - 0.6.0 - - - zotero@chnm.gmu.edu - 5.0 - 6.* - https://github.com/UB-Mannheim/zotero-ocr/releases/download/0.6.0/zotero-ocr-0.6.0.xpi - - - - - juris-m@juris-m.github.io - 5.0 - 6.* - https://github.com/UB-Mannheim/zotero-ocr/releases/download/0.6.0/zotero-ocr-0.6.0.xpi - - - - - - - - diff --git a/updates.json b/updates.json new file mode 100644 index 0000000..05292a3 --- /dev/null +++ b/updates.json @@ -0,0 +1,17 @@ +{ + "addons": { + "zotero-ocr@bib.uni-mannheim.de": { + "updates": [ + { + "version": "0.7.0", + "update_link": "https://github.com/UB-Mannheim/zotero-ocr/releases/download/0.7.0/zotero-ocr-0.7.0.xpi", + "applications": { + "zotero": { + "strict_min_version": "6.999" + } + } + } + ] + } + } +}