Merge pull request #1937 from dadoonet/pr/pl4j
Add support for REST plugins
dadoonet authored Sep 30, 2024
2 parents ea289c6 + 7aae2ae commit 20d1321
Showing 58 changed files with 2,029 additions and 219 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/pr.yml
@@ -40,7 +40,7 @@ jobs:
   # We run integration tests with elastic stack 8 (default)
   it:
     runs-on: ubuntu-latest
-    needs: build
+    needs: unit
     steps:
       - uses: actions/checkout@v4
       - name: Set up JDK 21
@@ -60,7 +60,7 @@ jobs:
   # We run integration tests with elastic stack 7
   it-es7:
     runs-on: ubuntu-latest
-    needs: build
+    needs: unit
     steps:
       - uses: actions/checkout@v4
       - name: Set up JDK 21
@@ -80,7 +80,7 @@ jobs:
   # We run integration tests with elastic stack 6
   it-es6:
     runs-on: ubuntu-latest
-    needs: build
+    needs: unit
    steps:
       - uses: actions/checkout@v4
       - name: Set up JDK 21
14 changes: 14 additions & 0 deletions cli/pom.xml
@@ -43,6 +43,20 @@
             <artifactId>log4j-slf4j-impl</artifactId>
         </dependency>
 
+        <!-- Built-in plugins -->
+        <dependency>
+            <groupId>fr.pilato.elasticsearch.crawler</groupId>
+            <artifactId>fscrawler-fs-local-plugin</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>fr.pilato.elasticsearch.crawler</groupId>
+            <artifactId>fscrawler-fs-s3-plugin</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>fr.pilato.elasticsearch.crawler</groupId>
+            <artifactId>fscrawler-fs-http-plugin</artifactId>
+        </dependency>
+
     </dependencies>
 
     <build>
FSCrawlerShutdownHook.java
@@ -19,6 +19,7 @@
 
 import fr.pilato.elasticsearch.crawler.fs.FsCrawlerImpl;
 import fr.pilato.elasticsearch.crawler.fs.rest.RestServer;
+import fr.pilato.elasticsearch.crawler.plugins.FsCrawlerPluginsManager;
 
 import java.io.IOException;
 
@@ -28,9 +29,11 @@
 class FSCrawlerShutdownHook extends Thread implements Runnable {
 
     private final FsCrawlerImpl fsCrawler;
+    private final FsCrawlerPluginsManager pluginsManager;
 
-    FSCrawlerShutdownHook(FsCrawlerImpl fsCrawler) {
+    FSCrawlerShutdownHook(FsCrawlerImpl fsCrawler, FsCrawlerPluginsManager pluginsManager) {
         this.fsCrawler = fsCrawler;
+        this.pluginsManager = pluginsManager;
     }
 
     @Override
@@ -39,6 +42,8 @@ public void run() {
             fsCrawler.close();
             // Stop the REST Server if needed
             RestServer.close();
+            // Stop the plugins
+            pluginsManager.close();
         } catch (InterruptedException | IOException e) {
             Thread.currentThread().interrupt();
         }
FsCrawlerCli.java
@@ -36,6 +36,7 @@
 import fr.pilato.elasticsearch.crawler.fs.settings.FsSettingsFileHandler;
 import fr.pilato.elasticsearch.crawler.fs.settings.FsSettingsParser;
 import fr.pilato.elasticsearch.crawler.fs.settings.Server.PROTOCOL;
+import fr.pilato.elasticsearch.crawler.plugins.FsCrawlerPluginsManager;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.logging.log4j.Level;
 import org.apache.logging.log4j.LogManager;
@@ -66,6 +67,7 @@ public class FsCrawlerCli {
     private static final long CLOSE_POLLING_WAIT_MS = 100;
 
     private static final Logger logger = LogManager.getLogger(FsCrawlerCli.class);
+    private static FsCrawlerPluginsManager pluginsManager;
 
     @SuppressWarnings("CanBeFinal")
     public static class FsCrawlerCommand {
@@ -130,6 +132,10 @@ public static void main(String[] args) throws Exception {
         // Display the welcome banner
         banner();
 
+        // Load all plugins
+        pluginsManager = new FsCrawlerPluginsManager();
+        pluginsManager.loadPlugins();
+
         // We can now launch the crawler
         runner(command);
     }
@@ -388,8 +394,10 @@ static void runner(FsCrawlerCommand command) throws IOException {
             return;
         }
 
+        pluginsManager.startPlugins();
+
         try (FsCrawlerImpl fsCrawler = new FsCrawlerImpl(configDir, fsSettings, command.loop, command.rest)) {
-            Runtime.getRuntime().addShutdownHook(new FSCrawlerShutdownHook(fsCrawler));
+            Runtime.getRuntime().addShutdownHook(new FSCrawlerShutdownHook(fsCrawler, pluginsManager));
             // Let see if we want to upgrade an existing cluster to the latest version
             if (command.upgrade) {
                 logger.info("Upgrading job [{}]. No rule implemented. Skipping.", jobName);
@@ -402,7 +410,7 @@
 
             // Start the REST Server if needed
             if (command.rest) {
-                RestServer.start(fsSettings, fsCrawler.getManagementService(), fsCrawler.getDocumentService());
+                RestServer.start(fsSettings, fsCrawler.getManagementService(), fsCrawler.getDocumentService(), pluginsManager);
             }
 
             // We just have to wait until the process is stopped
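The CLI wires in three lifecycle calls: loadPlugins() at startup, startPlugins() before the crawler runs, and close() from the shutdown hook. The branch name (pr/pl4j) and the new org.pf4j logger configured below suggest the manager is built on pf4j. Here is a minimal sketch of what such a wrapper could look like, assuming pf4j's DefaultPluginManager underneath; the actual FsCrawlerPluginsManager body is not visible in this excerpt:

    import java.io.Closeable;
    import java.nio.file.Paths;
    import java.util.List;
    import org.pf4j.DefaultPluginManager;
    import org.pf4j.PluginManager;

    // Sketch only: the real FsCrawlerPluginsManager internals are not shown in this diff.
    public class FsCrawlerPluginsManager implements Closeable {

        // pf4j scans a plugins directory for jars/zips; "plugins" matches the
        // directory added to assembly.xml below, but the path here is an assumption.
        private final PluginManager pluginManager = new DefaultPluginManager(Paths.get("plugins"));

        public void loadPlugins() {
            pluginManager.loadPlugins();    // discover descriptors, load plugin classes
        }

        public void startPlugins() {
            pluginManager.startPlugins();   // call start() on every loaded plugin
        }

        // Would let RestServer.start(...) collect REST extensions contributed by plugins
        public <T> List<T> getExtensions(Class<T> type) {
            return pluginManager.getExtensions(type);
        }

        @Override
        public void close() {
            pluginManager.stopPlugins();    // mirrors the shutdown hook: stop, then unload
            pluginManager.unloadPlugins();
        }
    }

Keeping the manager separate from FsCrawlerImpl lets the REST server and the crawler share the same plugin registry, which is consistent with the new RestServer.start(...) signature above.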
13 changes: 12 additions & 1 deletion cli/src/main/resources/log4j2-file.xml
@@ -47,7 +47,7 @@
         </Logger>
 
         <!-- This logger is used to log FSCrawler code execution -->
-        <Logger name="fr.pilato.elasticsearch.crawler.fs" level="${sys:LOG_LEVEL}" additivity="false">
+        <Logger name="fr.pilato.elasticsearch.crawler" level="${sys:LOG_LEVEL}" additivity="false">
             <AppenderRef ref="Console" />
             <AppenderRef ref="RollingFile" />
         </Logger>
@@ -65,6 +65,17 @@
         <Logger name="com.gargoylesoftware" level="error" additivity="false">
             <AppenderRef ref="RollingFile"/>
         </Logger>
+        <Logger name="org.pf4j" level="warn" additivity="false">
+            <AppenderRef ref="RollingFile" />
+        </Logger>
+        <Logger name="org.glassfish.jersey.server.wadl.WadlFeature" level="error" additivity="false">
+            <AppenderRef ref="Console" />
+            <AppenderRef ref="RollingFile" />
+        </Logger>
+        <Logger name="org.glassfish.jersey.internal.inject.Providers" level="error" additivity="false">
+            <AppenderRef ref="Console" />
+            <AppenderRef ref="RollingFile" />
+        </Logger>
 
         <Root level="warn">
             <AppenderRef ref="RollingFile" />
14 changes: 13 additions & 1 deletion cli/src/main/resources/log4j2.xml
@@ -51,7 +51,7 @@
         </Logger>
 
         <!-- This logger is used to log FSCrawler code execution -->
-        <Logger name="fr.pilato.elasticsearch.crawler.fs" level="${sys:LOG_LEVEL}" additivity="false">
+        <Logger name="fr.pilato.elasticsearch.crawler" level="${sys:LOG_LEVEL}" additivity="false">
             <AppenderRef ref="Console" />
             <AppenderRef ref="RollingFile" />
         </Logger>
@@ -73,6 +73,18 @@
             <AppenderRef ref="Console"/>
             <AppenderRef ref="RollingFile" />
         </Logger>
+        <Logger name="org.pf4j" level="warn" additivity="false">
+            <AppenderRef ref="Console" />
+            <AppenderRef ref="RollingFile" />
+        </Logger>
+        <Logger name="org.glassfish.jersey.server.wadl.WadlFeature" level="error" additivity="false">
+            <AppenderRef ref="Console" />
+            <AppenderRef ref="RollingFile" />
+        </Logger>
+        <Logger name="org.glassfish.jersey.internal.inject.Providers" level="error" additivity="false">
+            <AppenderRef ref="Console" />
+            <AppenderRef ref="RollingFile" />
+        </Logger>
 
         <Root level="warn">
             <AppenderRef ref="Console" />
4 changes: 4 additions & 0 deletions distribution/src/main/assembly/assembly.xml
@@ -37,6 +37,10 @@
                 <include>log4j2-file.xml</include>
             </includes>
         </fileSet>
+        <fileSet>
+            <directory>src/main/plugins</directory>
+            <outputDirectory>plugins</outputDirectory>
+        </fileSet>
         <fileSet>
             <directory>src/main/external</directory>
             <outputDirectory>external</outputDirectory>
1 change: 1 addition & 0 deletions distribution/src/main/plugins/README.txt
@@ -0,0 +1 @@
+You can place here your plugins. See https://fscrawler.readthedocs.io/.
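With pf4j (suggested by the pr/pl4j branch name and the org.pf4j logger configured above), a plugin dropped into this directory is a jar whose manifest declares a plugin class via pf4j's Plugin-Id, Plugin-Class and Plugin-Version entries. A hedged sketch of what a plugin could look like; the class and extension-point names below are invented for illustration and are not FSCrawler's actual interfaces:

    import org.pf4j.Extension;
    import org.pf4j.ExtensionPoint;
    import org.pf4j.Plugin;
    import org.pf4j.PluginWrapper;

    // Hypothetical plugin entry point, referenced from the jar's MANIFEST.MF.
    public class MyRestPlugin extends Plugin {
        public MyRestPlugin(PluginWrapper wrapper) {
            super(wrapper);
        }
    }

    // Hypothetical extension point; FSCrawler's real extension interfaces are not shown here.
    interface RestExtensionPoint extends ExtensionPoint {
        String path();
    }

    // pf4j discovers this implementation at runtime via the @Extension annotation.
    @Extension
    class PingEndpoint implements RestExtensionPoint {
        @Override
        public String path() {
            return "/ping";
        }
    }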
6 changes: 0 additions & 6 deletions docs/source/admin/cli-options.rst
@@ -19,8 +19,6 @@ CLI options
 Loop
 ----
 
-.. versionadded:: 2.2
-
 ``--loop x`` defines the number of runs we want before exiting:
 
 - ``X`` where X is a negative value means infinite, like ``-1`` (default)
@@ -33,8 +31,6 @@ If you want to scan your hard drive only once, run with ``--loop 1``.
 Restart
 -------
 
-.. versionadded:: 2.2
-
 You can tell FSCrawler that it must restart from the beginning by using
 ``--restart`` option:
@@ -47,8 +43,6 @@ In that case, the ``{job_name}/_status.json`` file will be removed.
 Rest
 ----
 
-.. versionadded:: 2.3
-
 If you want to run the :ref:`rest-service` without scanning
 your hard drive, launch with:
20 changes: 2 additions & 18 deletions docs/source/admin/fs/elasticsearch.rst
@@ -184,8 +184,6 @@ default settings using ``bulk_size``, ``byte_size`` and ``flush_interval``:
 Using Ingest Node Pipeline
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-.. versionadded:: 2.2
-
 If you are using an elasticsearch cluster running a 5.0 or superior
 version, you can use an Ingest Node pipeline to transform documents sent
 by FSCrawler before they are actually indexed.
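For reference, a pipeline is selected per job in the settings file; a minimal sketch, assuming the ``pipeline`` setting under ``elasticsearch`` (the setting is not shown in this hunk, and the pipeline name is illustrative):

.. code:: yaml

    name: "test"
    elasticsearch:
      pipeline: "my_pipeline"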
@@ -266,23 +264,11 @@ You can define multiple nodes:
     - url: "https://mynode2.mycompany.com:9200"
     - url: "https://mynode3.mycompany.com:9200"
 
-.. note::
-    .. versionadded:: 2.2 you can use HTTPS instead of HTTP.
-
-    .. code:: yaml
-
-        name: "test"
-        elasticsearch:
-          nodes:
-          - url: "https://CLUSTERID.eu-west-1.aws.found.io:9243"
-
-    For more information about HTTPS and SSL, read :ref:`ssl`.
 
 Path prefix
 ^^^^^^^^^^^
 
-.. versionadded:: 2.7 If your elasticsearch is running behind a proxy with url rewriting,
-   you might have to specify a path prefix. This can be done with ``path_prefix`` setting:
+If your elasticsearch is running behind a proxy with url rewriting,
+you might have to specify a path prefix. This can be done with ``path_prefix`` setting:
 
 .. code:: yaml
@@ -344,8 +330,6 @@ Then you can use the encoded API Key in FSCrawler settings:
 Basic Authentication (deprecated)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. versionadded:: 2.2
-
 The best practice is to use `API Key`_ or `Access Token`_. But if you have no other choice,
 you can still use Basic Authentication.
2 changes: 0 additions & 2 deletions docs/source/admin/fs/index.rst
@@ -124,7 +124,5 @@ Here is a list of existing top level settings:
 | ``rest``                          | :ref:`rest-service`           |
 +-----------------------------------+-------------------------------+
 
-.. versionadded:: 2.7
-
 You can define your job settings either in ``_settings.yaml`` (using ``.yaml`` extension) or
 in ``_settings.json`` (using ``.json`` extension).
18 changes: 0 additions & 18 deletions docs/source/admin/fs/local-fs.rst
@@ -128,8 +128,6 @@ Define ``fs.includes`` and ``fs.excludes`` properties in your
 By default, FSCrawler will exclude files starting with ``~``.
 
-.. versionadded:: 2.5
-
 It also applies to directory names. So if you want to ignore ``.ignore``
 dir, just add ``.ignore`` as an excluded name. Note that ``includes`` and ``excludes``
 apply to directory names as well.
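For example, a minimal sketch of a job file using both settings (the patterns are illustrative, based on the defaults and table this section describes):

.. code:: yaml

    name: "test"
    fs:
      includes:
        - "*/*.doc"
      excludes:
        - "*/~*"
        - "*/.ignore"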
@@ -177,15 +175,11 @@ Since the includes and excludes work on the entire *path of the file* you must c
 | ``*/old-*.jpg``    | Include all jpg files that start with ``old-`` | Exclude all jpg files that start with ``old-`` |
 +--------------------+------------------------------------------------+------------------------------------------------+
 
-.. versionadded:: 2.6
-
 If a folder contains a file named ``.fscrawlerignore``, this folder and its subfolders will be entirely skipped.
 
 Filter content
 ^^^^^^^^^^^^^^
 
-.. versionadded:: 2.5
-
 You can filter out documents you would like to index by adding one or more
 regular expression that match the extracted content.
 Documents which are not matching will be simply ignored and not indexed.
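A minimal sketch, assuming the ``filters`` setting under ``fs`` that this section documents (the regex is illustrative):

.. code:: yaml

    name: "test"
    fs:
      filters:
        - ".*foo.*"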
@@ -228,8 +222,6 @@ Elasticsearch will auto guess the mapping.
 Indexing XML docs
 ^^^^^^^^^^^^^^^^^
 
-.. versionadded:: 2.2
-
 If you want to index XML files and convert them to JSON, you can set
 ``xml_support`` to ``true``. The content of XML files will be added
 directly under \_source. If you need to keep XML documents synchronized
Index folders
^^^^^^^^^^^^^

.. versionadded:: 2.2

By default FSCrawler will index folder names in the folder index. If
you don’t want to index those folders, you can set ``index_folders`` to
``false``.
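A minimal sketch of the setting this section names:

.. code:: yaml

    name: "test"
    fs:
      index_folders: false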
@@ -574,8 +564,6 @@ metadata such as filename, date, size and path, you can set
 Continue on Error
 ^^^^^^^^^^^^^^^^^
 
-.. versionadded:: 2.3
-
 By default FSCrawler will immediately stop indexing if he hits a
 Permission denied exception. If you want to just skip this File and
 continue with the rest of the directory tree you can set
@@ -590,8 +578,6 @@ continue with the rest of the directory tree you can set
 Language detection
 ^^^^^^^^^^^^^^^^^^
 
-.. versionadded:: 2.2
-
 You can ask for language detection using ``lang_detect`` option:
 
 .. code:: yaml
@@ -721,8 +707,6 @@ If you want to extract the full content, define ``indexed_chars`` to
 Ignore Above
 ^^^^^^^^^^^^
 
-.. versionadded:: 2.5
-
 By default (if ``index_content`` set to ``true``) FSCrawler will send every single file to Tika, whatever its size.
 But some files on your file system might be a way too big to be parsed.
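A minimal sketch, assuming ``ignore_above`` takes a size value as other FSCrawler settings do (the threshold is illustrative):

.. code:: yaml

    name: "test"
    fs:
      ignore_above: "512mb"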

@@ -759,8 +743,6 @@ such as ``MD5`` or ``SHA-1``.
 Follow Symlinks
 ^^^^^^^^^^^^^^^
 
-.. versionadded:: 2.7
-
 If you want FSCrawler to follow the symbolic links, you need to be explicit about it and set
 ``follow_symlink`` to ``true``. Starting from version 2.7, symbolic links are not followed anymore.
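A minimal sketch, spelling the option exactly as this section does:

.. code:: yaml

    name: "test"
    fs:
      follow_symlink: true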
