diff --git a/.gitignore b/.gitignore
index 0612a99c23..2274f8974b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,14 @@ naivebayes-model
 csvindexwriter
 lib/spotbugs-*
 ivy/dependency-check-ant/*
+
+# Ignore Gradle project-specific cache directory
+.gradle
+
+# Ignore Gradle build output directory
+build
+
+# Ignore Gradle Wrapper
+gradle/
+gradlew
+gradlew.bat
\ No newline at end of file
diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644
index 0000000000..efc3b628ef
--- /dev/null
+++ b/Jenkinsfile
@@ -0,0 +1,67 @@
+#!groovy
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+def AGENT_LABEL = env.AGENT_LABEL ?: 'ubuntu'
+// =================================================================
+// https://cwiki.apache.org/confluence/display/INFRA/Jenkins
+// https://cwiki.apache.org/confluence/display/INFRA/Multibranch+Pipeline+recipes
+// =================================================================
+
+// general pipeline documentation: https://jenkins.io/doc/book/pipeline/syntax/
+pipeline {
+    agent {
+        node {
+            label AGENT_LABEL
+        }
+    }
+
+    environment {
+        LANG = 'C.UTF-8'
+    }
+
+    stages {
+        stage('Build') {
+            steps {
+                sh "gradle build"
+            }
+
+            post {
+                success {
+                    // tasks.jar in build.gradle.kts writes <final.name>.jar straight into build.dir ("build"), not into a Maven target/ directory
+                    archiveArtifacts 'build/*.jar'
+                }
+            }
+        }
+
+        stage('Test') {
+            steps {
+                sh "gradle test"
+            }
+
+            post {
+                always {
+                    // Gradle's Test task writes its JUnit XML reports below build/test-results/, not target/surefire-reports/
+                    junit testResults: '**/build/test-results/**/TEST-*.xml', testDataPublishers: [[$class: 'StabilityTestDataPublisher']]
+                }
+            }
+        }
+    }
+}
diff --git a/build.gradle.kts b/build.gradle.kts
new file mode 100644
index 0000000000..b584701cf6
--- /dev/null
+++ b/build.gradle.kts
@@ -0,0 +1,579 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+plugins {
+    application
+    base
+    java
+    `maven-publish`
+    `java-library`
+}
+
+repositories {
+    mavenCentral()
+}
+
+// Core and test dependencies; versions shared by several artifacts are declared once as extra properties.
+dependencies {
+    val cxfVersion by extra { "3.4.1" }
+    val hadoopVersion by extra { "3.1.3" }
+    val jacksonVersion by extra { "2.12.0" }
+    val log4j2Version by extra { "2.17.0" }
+    val mortbayJettyVersion by extra { "6.1.26" }
+
+    implementation("com.ibm.icu:icu4j:68.2")
+    implementation("com.fasterxml.jackson.core:jackson-annotations:${jacksonVersion}")
+    implementation("com.fasterxml.jackson.core:jackson-databind:${jacksonVersion}")
+    implementation("com.fasterxml.jackson.dataformat:jackson-dataformat-cbor:${jacksonVersion}")
+    implementation("com.fasterxml.jackson.jaxrs:jackson-jaxrs-json-provider:${jacksonVersion}")
+    implementation("com.github.crawler-commons:crawler-commons:1.2")
+    implementation("com.google.code.gson:gson:2.8.9")
+    implementation("com.google.guava:guava:30.1-jre")
+    implementation("com.martinkl.warc:warc-hadoop:0.1.0") {
+        exclude(module="hadoop-client")
+    }
+    implementation("com.rabbitmq:amqp-client:5.2.0")
+    implementation("com.tdunning:t-digest:3.2")
+    implementation("commons-codec:commons-codec:1.15")
+    implementation("de.vandermeer:asciitable:0.3.2")
+    implementation("org.apache.commons:commons-collections4:4.4")
+    implementation("org.apache.commons:commons-compress:1.21")
+    implementation("org.apache.commons:commons-jexl3:3.1")
+    implementation("org.apache.commons:commons-lang3:3.12.0")
+    implementation("org.apache.cxf:cxf-rt-frontend-jaxrs:${cxfVersion}")
+    implementation("org.apache.cxf:cxf-rt-frontend-jaxws:${cxfVersion}")
+    implementation("org.apache.cxf:cxf-rt-transports-http:${cxfVersion}")
+    implementation("org.apache.cxf:cxf-rt-transports-http-jetty:${cxfVersion}")
+    implementation("org.apache.hadoop:hadoop-common:${hadoopVersion}") {
+        exclude("ant", "ant")
+        exclude("hsqldb", "hsqldb")
+        exclude("net.java.dev.jets3t", "jets3t")
+        exclude("net.sf.kosmosfs", "kfs")
+        exclude("org.eclipse.jdt", "core")
+        // NOTE(review): Gradle exclude rules appear to match module names literally - confirm that "jsp-*" excludes anything.
+        exclude("org.mortbay.jetty", "jsp-*")
+    }
+    implementation("org.apache.hadoop:hadoop-hdfs:${hadoopVersion}")
+    implementation("org.apache.hadoop:hadoop-mapreduce-client-core:${hadoopVersion}")
+    implementation("org.apache.hadoop:hadoop-mapreduce-client-jobclient:${hadoopVersion}")
+    implementation("org.apache.httpcomponents:httpcore:4.4.14")
+    implementation("org.apache.httpcomponents:httpcore-nio:4.4.9")
+    implementation("org.apache.httpcomponents:httpclient:4.5.13")
+    implementation("org.apache.logging.log4j:log4j-api:${log4j2Version}")
+    implementation("org.apache.logging.log4j:log4j-core:${log4j2Version}")
+    implementation("org.apache.logging.log4j:log4j-slf4j-impl:${log4j2Version}")
+    implementation("org.apache.tika:tika-core:2.2.1")
+    implementation("org.mortbay.jetty:jetty:${mortbayJettyVersion}")
+    // The reasons for the individual exclusions are kept as comments: because(...) describes the
+    // dependency itself (not an exclude) and every further call replaces the previous reason.
+    implementation("org.netpreserve.commons:webarchive-commons:1.1.9") {
+        exclude(group="com.google.guava")
+        // Incompatible LGPL 2.1 licence; exclusion disables support of WARC generation by
+        // 'bin/nutch commoncrawldump -warc ...'. Remove this exclusion and recompile Nutch to
+        // generate WARC files using the 'commoncrawldump' tool.
+        exclude("it.unimi.dsi", "dsiutils")
+        exclude(group="junit")
+        exclude(module="hadoop-core")
+        // Incompatible LGPL 2.1 licence; exclusion disables support of WARC generation by
+        // 'bin/nutch commoncrawldump -warc ...'. Remove this exclusion and recompile Nutch to
+        // generate WARC files using the 'commoncrawldump' tool.
+        exclude("org.gnu.inet", "libidn")
+        // Incompatible JSON licence.
+        exclude(group="org.json")
+    }
+    implementation("org.slf4j:slf4j-api:1.7.35")
+    implementation("xerces:xercesImpl:2.12.1")
+    implementation("xml-apis:xml-apis:1.4.01") {
+        because("Force this version as it is required by Tika.")
+    }
+
+    testImplementation("junit:junit:4.13.2")
+    testImplementation("org.apache.cxf:cxf-rt-rs-client:${cxfVersion}")
+    testImplementation("org.apache.mrunit:mrunit:1.1.0:hadoop2") {
+        exclude("log4j", "log4j")
+    }
+    testImplementation("org.mortbay.jetty:jetty-client:${mortbayJettyVersion}")
+}
+
+application {
+    // Define the main class for the application.
+    //mainClass.set("nutch.App")
+}
+
+// Maven publication of the core artifact to the ASF staging repository.
+publishing {
+    publications {
+        // The publication container needs the concrete publication type to create an element.
+        create<MavenPublication>("maven") {
+            groupId = "org.apache.nutch"
+            artifactId = "nutch"
+            // Take the version from gradle.properties ("version") instead of repeating it here.
+            version = project.version.toString()
+
+            from(components["java"])
+        }
+    }
+    repositories {
+        maven {
+            url = uri("https://repository.apache.org/service/local/staging/deploy/maven2")
+        }
+    }
+}
+
+// Fail the build instead of silently picking a version when two dependencies disagree.
+configurations {
+    implementation {
+        resolutionStrategy.failOnVersionConflict()
+    }
+}
+
+// Exclusions applied to every configuration.
+configurations.all {
+    exclude(group="com.thoughtworks.xstream")
+    exclude(module="jms")
+    exclude(module="jmx-tools")
+    exclude(module="jmxri")
+    exclude(module="log4j")
+    exclude(module="slf4j-log4j12")
+}
+
+// sourceCompatibility/targetCompatibility are properties of the Java plugin extension.
+configure<JavaPluginExtension> {
+    sourceCompatibility=JavaVersion.VERSION_11
+    targetCompatibility=JavaVersion.VERSION_11
+}
+
+// the normal classpath
+val classpathCollection: FileCollection = layout.files(
+    file("${project.properties["build.classes"]}"),
+    fileTree(mapOf("dir" to project.properties["build.lib.dir"], "include" to listOf("*.jar")))
+)
+val classPath: String = classpathCollection.asPath
+
+// test classpath
+val testClasspathCollection: FileCollection = layout.files(
+    file("${project.properties["test.build.classes"]}"),
+    file("${project.properties["conf.dir"]}"),
+    file("${project.properties["test.src.dir"]}"), 
+    file("${project.properties["build.plugins"]}"),
+    classpathCollection,
+    // The "job" task writes <final.name>.job into <build.dir> relative to the project directory.
+    // build.dir already names the build directory, so it must not be resolved below layout.buildDirectory again.
+    file("${project.properties["build.dir"]}/${project.properties["final.name"]}.job"),
+    fileTree(mapOf("dir" to project.properties["build.lib.dir"], "include" to listOf("*.jar"))),
+    fileTree(mapOf("dir" to project.properties["test.build.lib.dir"], "include" to listOf("*.jar")))
+)
+
+// legacy ant target "init" renamed to "init-nutch" to avoid gradle naming conflicts
+// Typed as Copy: from/include/rename/into below are Copy task members.
+tasks.register<Copy>("init-nutch") {
+    group = "gradleBuildSystem"
+    description = "Stuff required by all targets"
+
+    // making six directories
+    // NOTE(review): these mkdir calls run when the task is configured, not when it executes.
+    mkdir("${project.properties["build.dir"]}")
+    mkdir("${project.properties["build.classes"]}")
+    mkdir("${project.properties["build.dir"]}/release")
+    mkdir("${project.properties["test.build.dir"]}")
+    mkdir("${project.properties["test.build.classes"]}")
+    mkdir("${project.properties["test.build.lib.dir"]}")
+
+    // renaming from *.template to * for all files in folders in conf.dir
+    from(layout.projectDirectory.dir("${project.properties["conf.dir"]}"))
+    include("**/*.template")
+    rename { filename: String ->
+        filename.replace(".template", "")
+    }
+    into(layout.projectDirectory.dir("${project.properties["conf.dir"]}"))
+}
+
+// Copies the resolved compile and runtime dependencies into the build directory's lib.dir folder.
+tasks.register<Copy>("resolve-default") {
+    group = "gradleBuildSystem"
+    description = "Resolve and retrieve dependencies"
+    dependsOn("clean-default-lib","init-nutch","copy-libs")
+    from(configurations.compileClasspath)
+    from(configurations.runtimeClasspath)
+    into(layout.buildDirectory.dir("${project.properties["lib.dir"]}"))
+}
+
+// Copies the resolved test dependencies into test.build.lib.dir.
+tasks.register<Copy>("resolve-test") {
+    group = "gradleBuildSystem"
+    description = "Resolve and retrieve dependencies"
+    dependsOn("clean-test-lib","init-nutch","copy-libs")
+    from(configurations.testCompileClasspath)
+    from(configurations.testRuntimeClasspath)
+    into(layout.projectDirectory.dir("${project.properties["test.build.lib.dir"]}"))
+}
+
+// Aggregate task: core classes plus plugins.
+tasks.register("compile") {
+    group = "gradleBuildSystem"
+    description = "Compile all Java files"
+    dependsOn("compile-core","compile-plugins")
+}
+
+// Compiles src.dir into build.classes and copies the non-Java resources next to the classes.
+tasks.register<JavaCompile>("compile-core") {
+    group = "gradleBuildSystem"
+    description = "Compile core Java files only"
+    dependsOn("init-nutch","resolve-default","compileJava")
+    source = fileTree("${project.properties["src.dir"]}")
+    include("org/apache/nutch/**/*.java")
+    destinationDirectory.set(layout.projectDirectory.dir("${project.properties["build.classes"]}"))
+    classpath = classpathCollection
+    sourceCompatibility = "${project.properties["javac.version"]}"
+    targetCompatibility = "${project.properties["javac.version"]}"
+    options.annotationProcessorPath = classpathCollection
+    options.sourcepath = layout.files("${project.properties["src.dir"]}")
+    options.compilerArgs.add("-Xlint:-path")
+    options.compilerArgs.add("-Xpkginfo:always")
+    options.encoding = "${project.properties["build.encoding"]}"
+    // gradle.properties sets these switches to "true"; "on" is still accepted.
+    options.isDebug = "${project.properties["javac.debug"]}" in listOf("on", "true")
+    options.isDeprecation = "${project.properties["javac.deprecation"]}" in listOf("on", "true")
+
+    doLast {
+        // Copy the resources when the task executes; a bare copy { } here would run during configuration.
+        copy {
+            from(layout.projectDirectory.dir("${project.properties["src.dir"]}"))
+            include("**/*.html", "**/*.css", "**/*.properties")
+            into(layout.projectDirectory.dir("${project.properties["build.classes"]}"))
+        }
+        delete("${project.properties["build.dir"]}/tmp")
+    }
+}
+
+// Runs org.apache.nutch.tools.proxy.ProxyTestbed on the test classpath.
+tasks.register<JavaExec>("proxy") {
+    group = "gradleBuildSystem"
+    description = "Run nutch proxy"
+    dependsOn("compile-core-test","job")
+
+    mainClass.set("org.apache.nutch.tools.proxy.ProxyTestbed")
+    classpath = testClasspathCollection
+    args("-fake")
+    jvmArgs("-Djavax.xml.parsers.DocumentBuilderFactory=com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl")
+}
+
+// Runs org.apache.nutch.tools.Benchmark on the test classpath.
+tasks.register<JavaExec>("benchmark") {
+    group = "gradleBuildSystem"
+    description = "Run nutch benchmarking analysis"
+
+    mainClass.set("org.apache.nutch.tools.Benchmark")
+    classpath = testClasspathCollection
+    // One element per JVM argument: a single space-separated string is handed to the JVM as one argument.
+    jvmArgs("-Xmx512m", "-Djavax.xml.parsers.DocumentBuilderFactory=com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl")
+    args("-maxPerHost")
+    args("10")
+    args("-seeds")
+    args("1")
+    args("-depth")
+    args("5")
+}
+
+tasks.clean {
+    group = "gradleBuildSystem"
+    description = "Clean the project"
+    dependsOn("clean-build","clean-lib","clean-dist","clean-runtime")
+}
+
+tasks.register("clean-lib") {
+    group = "gradleBuildSystem"
+    description = "Clean the project libraries directories (dependencies: default + test)"
+    dependsOn("clean-default-lib","clean-test-lib")
+}
+
+// The clean-* tasks are typed as Delete so that delete(...) registers a target for task execution;
+// on an untyped task the call resolves to project.delete and removes the directory during configuration.
+tasks.register<Delete>("clean-default-lib") {
+    group = "gradleBuildSystem"
+    description = "Clean the project libraries directory (dependencies)"
+    delete("${project.properties["build.lib.dir"]}")
+}
+
+tasks.register<Delete>("clean-test-lib") {
+    group = "gradleBuildSystem"
+    description = "Clean the project test libraries directory (dependencies)"
+    delete("${project.properties["test.build.lib.dir"]}")
+}
+
+tasks.register<Delete>("clean-build") {
+    group = "gradleBuildSystem"
+    description = "Clean the project built files"
+    delete("${project.properties["build.dir"]}")
+}
+
+tasks.register<Delete>("clean-dist") {
+    group = "gradleBuildSystem"
+    description = "Clean the project dist files"
+    delete("${project.properties["dist.dir"]}")
+}
+
+tasks.register<Delete>("clean-runtime") {
+    group = "gradleBuildSystem"
+    description = "Clean the project runtime area"
+    delete("${project.properties["runtime.dir"]}")
+}
+
+// Copies the jars checked in under lib.dir into build.lib.dir.
+tasks.register<Copy>("copy-libs") {
+    group = "gradleBuildSystem"
+    description = "Copy the libs in lib"
+    from("${project.properties["lib.dir"]}") {
+        include("**/*.jar")
+    }
+    into("${project.properties["build.lib.dir"]}")
+}
+
+tasks.register("compile-plugins") {
+    group = "gradleBuildSystem"
+    description = "Compile plugins only"
+    dependsOn("init-nutch","resolve-default")
+    //TODO Once plugins are finished, uncomment the following lines:
+    // dir = file("src/plugin")
+    // tasks = listOf("deploy")
+}
+
+// Packs build.classes (plus nutch-default.xml and nutch-site.xml) into <build.dir>/<final.name>.jar.
+tasks.jar {
+    group = "gradleBuildSystem"
+    description = "Make nutch.jar"
+    dependsOn("compile-core")
+
+    doFirst {
+        // Stage the configuration files right before packing; a bare copy { } here would run during configuration.
+        copy {
+            from(
+                file(layout.projectDirectory.dir("${project.properties["conf.dir"]}/nutch-default.xml")),
+                file(layout.projectDirectory.dir("${project.properties["conf.dir"]}/nutch-site.xml"))
+            )
+            into("${project.properties["build.classes"]}")
+        }
+    }
+    archiveFileName.set("${project.properties["final.name"]}.jar")
+    destinationDirectory.set(layout.projectDirectory.dir("${project.properties["build.dir"]}"))
+    from(files("${project.properties["build.classes"]}"))
+    doLast {
+        delete("${project.properties["build.dir"]}/tmp")
+    }
+}
+
+// Assembles the runtime/deploy and runtime/local trees from the build outputs.
+tasks.register<Copy>("runtime") {
+    group = "gradleBuildSystem"
+    description = "Default target for running Nutch"
+    dependsOn("jar","job")
+    mkdir("${project.properties["runtime.dir"]}")
+    mkdir("${project.properties["runtime.local"]}")
+    mkdir("${project.properties["runtime.deploy"]}")
+
+    into(layout.projectDirectory)
+
+    into("${project.properties["runtime.deploy"]}") {
+        from(layout.projectDirectory.dir("${project.properties["build.dir"]}/${project.properties["final.name"]}.job"))
+    }
+    into("${project.properties["runtime.deploy"]}/bin") {
+        from(layout.projectDirectory.dir("src/bin"))
+    }
+    into("${project.properties["runtime.local"]}/lib") {
+        from(layout.projectDirectory.dir("${project.properties["build.dir"]}/${project.properties["final.name"]}.jar"))
+    }
+    into("${project.properties["runtime.local"]}/lib/native") {
+        from(layout.projectDirectory.dir("lib/native"))
+    }
+    into("${project.properties["runtime.local"]}/conf") {
+        from(layout.projectDirectory.dir("${project.properties["conf.dir"]}")) {
+            exclude("*.template")
+        }
+    }
+    into("${project.properties["runtime.local"]}/bin") {
+        from(layout.projectDirectory.dir("src/bin"))
+    }
+    into("${project.properties["runtime.local"]}/lib") {
+        from(layout.projectDirectory.dir("${project.properties["build.lib.dir"]}"))
+    }
+    into("${project.properties["runtime.local"]}/plugins") {
+        from(layout.projectDirectory.dir("${project.properties["build.dir"]}/plugins"))
+    }
+    into("${project.properties["runtime.local"]}/test") {
+        from(layout.projectDirectory.dir("${project.properties["build.dir"]}/test"))
+    }
+
+    doLast {
+        // An exec spec keeps only its last commandLine(...) call, so both directories go to a single chmod.
+        // NOTE(review): chmod is not recursive here - confirm whether the scripts inside bin/ should be covered too.
+        project.exec {
+            commandLine(
+                "chmod", "ugo+x",
+                "${project.properties["runtime.deploy"]}/bin",
+                "${project.properties["runtime.local"]}/bin"
+            )
+        }
+    }
+}
+
+// Builds <build.dir>/<final.name>.job from classes, configuration, libraries (under lib/) and plugins (under classes/plugins).
+tasks.register<Jar>("job") {
+    group = "gradleBuildSystem"
+    description = "Make nutch.job jar"
+    dependsOn("compile")
+
+    duplicatesStrategy = DuplicatesStrategy.EXCLUDE
+    archiveFileName.set("${project.properties["final.name"]}.job")
+    destinationDirectory.set(layout.projectDirectory.dir("${project.properties["build.dir"]}"))
+    from(layout.projectDirectory.dir("${project.properties["build.classes"]}")) {
+        exclude("nutch-default.xml","nutch-site.xml")
+    }
+    from(layout.projectDirectory.dir("${project.properties["conf.dir"]}")) {
+        exclude("*.template","hadoop*.*")
+    }
+    from(layout.projectDirectory.dir("${project.properties["build.lib.dir"]}")) {
+        include("**/*.jar")
+        exclude("hadoop-*.jar","slf4j*.jar","log4j*.jar")
+        into("lib")
+    }
+    from(layout.projectDirectory.dir("${project.properties["build.plugins"]}")) {
+        exclude("nutch-default.xml","nutch-site.xml")
+        into("classes/plugins")
+    }
+    doLast {
+        delete("${project.properties["build.dir"]}/tmp")
+    }
+}
+
+// Compiles test.src.dir into test.build.classes against the test classpath.
+tasks.register<JavaCompile>("compile-core-test") {
+    group = "gradleBuildSystem"
+    description = "Compile test code"
+    dependsOn("init-nutch","compile-core","resolve-test","compileTestJava")
+
+    source = fileTree("${project.properties["test.src.dir"]}")
+    include("org/apache/nutch/**/*.java")
+    destinationDirectory.set(layout.projectDirectory.dir("${project.properties["test.build.classes"]}"))
+    classpath = testClasspathCollection
+    sourceCompatibility = "${project.properties["javac.version"]}"
+    targetCompatibility = "${project.properties["javac.version"]}"
+
+    options.annotationProcessorPath = 
testClasspathCollection
+    options.sourcepath = layout.files("${project.properties["src.dir"]}")
+    options.compilerArgs.add("-Xlint:-path")
+    options.compilerArgs.add("-Xpkginfo:always")
+    options.encoding = "${project.properties["build.encoding"]}"
+    // gradle.properties sets these switches to "true"; "on" is still accepted.
+    options.isDebug = "${project.properties["javac.debug"]}" in listOf("on", "true")
+    options.isDeprecation = "${project.properties["javac.deprecation"]}" in listOf("on", "true")
+}
+
+// NOTE(review): "test-core" and "test-plugins" are not registered in this build script - confirm where they are defined.
+tasks.test.configure {
+    description = "Run JUnit tests"
+    dependsOn("test-core","test-plugins")
+}
+
+
+// Generates the API documentation for src.dir into build.javadoc and copies the referenced resources afterwards.
+tasks.javadoc.configure {
+    group = "gradleBuildSystem"
+    description = "Generate Javadoc"
+    dependsOn("compile")
+
+    // Plain string comparison against the oldest supported JDK version string.
+    val version:String = System.getProperty("java.version")
+    if("1.7.0_25".compareTo(version) >= 0)
+        throw GradleException(
+            "Unsupported Java version: ${version}. Javadoc requires Java version 7u25 " +
+            "or greater. See https://issues.apache.org/jira/browse/NUTCH-1590"
+        )
+
+    mkdir("${project.properties["build.javadoc"]}")
+    mkdir("${project.properties["build.javadoc"]}/resources")
+
+    source(fileTree("${project.properties["src.dir"]}"))
+    // todo: add plugins to source
+    include("**/*.java")
+    setDestinationDir(file("${project.properties["build.javadoc"]}"))
+    classpath = classpathCollection.plus(
+        layout.files(
+            fileTree(mapOf("dir" to project.properties["build.plugins"], "include" to listOf("**/*.jar"), "exclude" to listOf("any23/javax.annotation-api*.jar")))
+        )
+    )
+    options {
+        this as StandardJavadocDocletOptions
+        overview = "${project.properties["src.dir"]}/overview.html"
+        windowTitle = "${project.properties["projname"]} ${project.properties["version"]} API"
+        docTitle = windowTitle
+        bottom = "Copyright \u00a9 ${project.properties["year"]} The Apache Software Foundation"
+        isAuthor = true
+        isVersion = true
+        isUse = true
+        links("${project.properties["javadoc.link.java"]}","${project.properties["javadoc.link.hadoop"]}")
+        // Package groups of the overview page; each plugins.* property is a ':'-separated list of package patterns.
+        groups = mutableMapOf(
+            "Core" to mutableListOf("org.apache.nutch.*"),
+            "Plugins API" to "${project.properties["plugins.api"]}" .split(":"),
+            "Protocol Plugins" to "${project.properties["plugins.protocol"]}" .split(":"),
+            "URL Filter Plugins" to "${project.properties["plugins.urlfilter"]}" .split(":"),
+            "URL Normalizer Plugins" to "${project.properties["plugins.urlnormalizer"]}".split(":"),
+            "Scoring Plugins" to "${project.properties["plugins.scoring"]}" .split(":"),
+            "Parse Plugins" to "${project.properties["plugins.parse"]}" .split(":"),
+            "Parse Filter Plugins" to "${project.properties["plugins.parsefilter"]}" .split(":"),
+            "Publisher Plugins" to "${project.properties["plugins.publisher"]}" .split(":"),
+            "Exchange Plugins" to "${project.properties["plugins.exchange"]}" .split(":"),
+            "Indexing Filter Plugins" to "${project.properties["plugins.index"]}" .split(":"),
+            "Indexer Plugins" to "${project.properties["plugins.indexer"]}" .split(":"),
+            "Misc. Plugins" to "${project.properties["plugins.misc"]}" .split(":")
+        )
+        addBooleanOption("-allow-script-in-comments", true)
+        // NOTE(review): the proxy properties hold complete "-J-D...=" flags but are passed as option names without a value - confirm they reach javadoc.
+        addStringOption("${project.properties["javadoc.proxy.host"]}")
+        addStringOption("${project.properties["javadoc.proxy.port"]}")
+    }
+
+
+    doLast {
+        copy {
+            from(file(layout.projectDirectory.dir("${project.properties["plugins.dir"]}/plugin.dtd")))
+            into("${project.properties["build.javadoc"]}/org/apache/nutch/plugin/doc-files")
+        }
+        copy {
+            from(
+                file(layout.projectDirectory.dir("${project.properties["conf.dir"]}/nutch-default.xml")),
+                file(layout.projectDirectory.dir("${project.properties["conf.dir"]}/configuration.xsl"))
+            )
+            into("${project.properties["build.javadoc"]}/resources")
+        }
+        delete("${project.properties["build.dir"]}/tmp")
+    }
+}
+
+// Stages the source distribution tree below src.dist.version.dir.
+// Typed as Copy: destinationDir, from and into are Copy task members.
+tasks.register<Copy>("package-src") {
+    group = "gradleBuildSystem"
+    description = "Generate source distribution package"
+    dependsOn("runtime","javadoc")
+
+    destinationDir = file(".")
+
+    mkdir("${project.properties["dist.dir"]}")
+    mkdir("${project.properties["src.dist.version.dir"]}") 
+    mkdir("${project.properties["src.dist.version.dir"]}/lib")
+    mkdir("${project.properties["src.dist.version.dir"]}/docs")
+    mkdir("${project.properties["src.dist.version.dir"]}/docs/api")
+    mkdir("${project.properties["src.dist.version.dir"]}/ivy")
+
+    from("lib") {
+        includeEmptyDirs = false
+        into("${project.properties["src.dist.version.dir"]}/lib")
+    }
+    from("${project.properties["conf.dir"]}") {
+        exclude("**/*.template")
+        into("${project.properties["src.dist.version.dir"]}/conf")
+    }
+    from("${project.properties["build.javadoc"]}") {
+        into("${project.properties["src.dist.version.dir"]}/docs/api")
+    }
+    from(".") {
+        include("*.txt")
+        into("${project.properties["src.dist.version.dir"]}")
+    }
+    from("src") {
+        includeEmptyDirs = true
+        into("${project.properties["src.dist.version.dir"]}/src")
+    }
+    from(".") {
+        include("build.gradle.kts","gradle.properties","settings.gradle.kts")
+        into("${project.properties["src.dist.version.dir"]}/")
+    }
+}
+
+// Zips the staged source distribution; everything is packed with mode 664 except src/bin/*, which gets 755.
+tasks.register<Zip>("zip-src") {
+    group = "gradleBuildSystem"
+    description = "Generate src.zip distribution package"
+    dependsOn("package-src")
+
+    archiveFileName.set("${project.properties["src.dist.version.dir"]}.zip")
+    destinationDirectory.set(layout.projectDirectory)
+
+    // Each from(...) { } block is its own child copy spec, so the mode and the filters apply to that block only.
+    // Inside files(...) { } the same calls would configure the whole Zip task: the last fileMode would win
+    // and the include/exclude patterns of both blocks would be merged.
+    from("${project.properties["src.dist.version.dir"]}") {
+        // Kotlin has no octal literals; parse the Unix mode in base 8 (a plain 664 is a decimal number).
+        fileMode = "664".toInt(8)
+        exclude("src/bin/*")
+        //TODO delete the following line once Ivy is removed completely
+        exclude("ivy")
+        include("**")
+    }
+    from("${project.properties["src.dist.version.dir"]}") {
+        fileMode = "755".toInt(8)
+        include("src/bin/*")
+    }
+}
diff --git a/gradle.properties b/gradle.properties
new file mode 100644
index 0000000000..5d9d60ebb9
--- /dev/null
+++ b/gradle.properties
@@ -0,0 +1,197 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +projname=apache-nutch +version=1.19-SNAPSHOT +final.name=apache-nutch-1.19-SNAPSHOT +year=2022 + +src.dir = src/java +lib.dir = lib +conf.dir = conf +plugins.dir = src/plugin + +build.dir = build +build.classes = build/classes +build.plugins = build/plugins +build.javadoc = build/docs/api +build.encoding = UTF-8 +build.lib.dir = build/lib + +test.src.dir = src/test +test.build.dir = build/test +test.build.lib.dir = build/test/lib +test.build.data = build/test/data +test.build.classes = build/test/classes +test.build.javadoc = build/test/docs/api +test.junit.output.format = plain + +# Proxy Host and Port to use for building JavaDoc +javadoc.proxy.host = -J-DproxyHost= +javadoc.proxy.port = -J-DproxyPort= +javadoc.link.java = https://docs.oracle.com/en/java/javase/11/docs/api/ +javadoc.link.hadoop = https://hadoop.apache.org/docs/r3.1.3/api/ +javadoc.packages = org.apache.nutch.* + +dist.dir= dist +src.dist.version.dir = dist/apache-nutch-1.19-SNAPSHOT-src +bin.dist.version.dir = dist/apache-nutch-1.19-SNAPSHOT-bin + +javac.debug = true +javac.deprecation = true +javac.version = 11 + +runtime.dir = runtime +runtime.deploy = runtime/deploy +runtime.local = runtime/local + +# +# Plugins API +# +plugins.api=\ + org.apache.nutch.protocol.http.api*:\ + org.apache.nutch.urlfilter.api* + +# +# Protocol Plugins +# +plugins.protocol=\ + org.apache.nutch.protocol.file*:\ + org.apache.nutch.protocol.ftp*:\ + 
org.apache.nutch.protocol.http*:\
+  org.apache.nutch.protocol.httpclient*:\
+  org.apache.nutch.protocol.interactiveselenium*:\
+  org.apache.nutch.protocol.okhttp*:\
+  org.apache.nutch.protocol.selenium*:\
+  org.apache.nutch.protocol.htmlunit*
+#
+# URL Filter Plugins
+#
+plugins.urlfilter=\
+  org.apache.nutch.urlfilter.automaton*:\
+  org.apache.nutch.urlfilter.domain*:\
+  org.apache.nutch.urlfilter.domaindenylist*:\
+  org.apache.nutch.urlfilter.fast*:\
+  org.apache.nutch.urlfilter.ignoreexempt*:\
+  org.apache.nutch.urlfilter.prefix*:\
+  org.apache.nutch.urlfilter.regex*:\
+  org.apache.nutch.urlfilter.suffix*:\
+  org.apache.nutch.urlfilter.validator*
+
+#
+# URL Normalizer Plugins
+#
+plugins.urlnormalizer=\
+  org.apache.nutch.net.urlnormalizer.ajax*:\
+  org.apache.nutch.net.urlnormalizer.basic*:\
+  org.apache.nutch.net.urlnormalizer.host*:\
+  org.apache.nutch.net.urlnormalizer.pass*:\
+  org.apache.nutch.net.urlnormalizer.protocol*:\
+  org.apache.nutch.net.urlnormalizer.querystring*:\
+  org.apache.nutch.net.urlnormalizer.regex*:\
+  org.apache.nutch.net.urlnormalizer.slash*
+
+#
+# Scoring Plugins
+#
+plugins.scoring=\
+  org.apache.nutch.scoring.depth*:\
+  org.apache.nutch.scoring.link*:\
+  org.apache.nutch.scoring.opic*:\
+  org.apache.nutch.scoring.orphan*:\
+  org.apache.nutch.scoring.similarity*:\
+  org.apache.nutch.scoring.tld*:\
+  org.apache.nutch.scoring.urlmeta*:\
+  org.apache.nutch.scoring.metadata*
+
+#
+# Parse Plugins
+#
+plugins.parse=\
+  org.apache.nutch.parse.ext*:\
+  org.apache.nutch.parse.feed*:\
+  org.apache.nutch.parse.html*:\
+  org.apache.nutch.parse.js:\
+  org.apache.nutch.parse.replace*:\
+  org.apache.nutch.parse.swf*:\
+  org.apache.nutch.parse.tika:\
+  org.apache.nutch.parse.zip
+
+#
+# Parse Filter Plugins
+#
+plugins.parsefilter=\
+  org.apache.nutch.parsefilter.debug*:\
+  org.apache.nutch.parse.headings*:\
+  org.apache.nutch.parsefilter.naivebayes*:\
+  org.apache.nutch.parsefilter.regex*:\
+  org.apache.nutch.parse.metatags*
+
+#
+# Publisher 
Plugins +# +plugins.publisher=\ + org.apache.nutch.publisher.rabbitmq* + +# +# Exchange Plugins +# +plugins.exchange=\ + org.apache.nutch.exchange.jexl* + +# +# Indexing Filter Plugins +# +plugins.index=\ + org.apache.nutch.indexer.anchor*:\ + org.apache.nutch.indexer.basic*:\ + org.apache.nutch.indexer.feed*:\ + org.apache.nutch.indexer.geoip*:\ + org.apache.nutch.indexer.jexl*:\ + org.apache.nutch.indexer.filter*:\ + org.apache.nutch.indexer.links*:\ + org.apache.nutch.indexer.metadata*:\ + org.apache.nutch.indexer.more*:\ + org.apache.nutch.indexer.replace*:\ + org.apache.nutch.indexer.staticfield*:\ + org.apache.nutch.indexer.subcollection*:\ + org.apache.nutch.indexer.tld*:\ + org.apache.nutch.indexer.urlmeta* + +# +# Indexing Backend Plugins +# +plugins.indexer=\ + org.apache.nutch.indexwriter.cloudsearch*:\ + org.apache.nutch.indexwriter.csv*:\ + org.apache.nutch.indexwriter.dummy*:\ + org.apache.nutch.indexwriter.elastic*:\ + org.apache.nutch.indexwriter.rabbit*:\ + org.apache.nutch.indexwriter.kafka*:\ + org.apache.nutch.indexwriter.solr* + +# +# Misc. Plugins +# +# (gathers plugins that cannot be dispatched +# in any category, mainly because they contain +# many extension points) +# +plugins.misc=\ + org.apache.nutch.collection*:\ + org.apache.nutch.analysis.lang*:\ + org.creativecommons.nutch*:\ + org.apache.nutch.microformats.reltag*:\ + org.apache.nutch.any23* diff --git a/settings.gradle.kts b/settings.gradle.kts new file mode 100644 index 0000000000..5f6b6be4b8 --- /dev/null +++ b/settings.gradle.kts @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +rootProject.name = "nutch" + +// includeBuild("src/plugin/any23") +// includeBuild("src/plugin/build-plugin.xml") +// includeBuild("src/plugin/creativecommons") +// includeBuild("src/plugin/exchange-jexl") +// includeBuild("src/plugin/feed") +// includeBuild("src/plugin/headings") +// includeBuild("src/plugin/index-anchor") +// includeBuild("src/plugin/index-basic") +// includeBuild("src/plugin/index-geoip") +// includeBuild("src/plugin/index-jexl-filter") +// includeBuild("src/plugin/index-links") +// includeBuild("src/plugin/index-metadata") +// includeBuild("src/plugin/index-more") +// includeBuild("src/plugin/index-replace") +// includeBuild("src/plugin/index-static") +// includeBuild("src/plugin/indexer-cloudsearch") +// includeBuild("src/plugin/indexer-csv") +// includeBuild("src/plugin/indexer-dummy") +// includeBuild("src/plugin/indexer-elastic") +// includeBuild("src/plugin/indexer-kafka") +// includeBuild("src/plugin/indexer-rabbit") +// includeBuild("src/plugin/indexer-solr") +// includeBuild("src/plugin/language-identifier") +// includeBuild("src/plugin/lib-htmlunit") +// includeBuild("src/plugin/lib-http") +// includeBuild("src/plugin/lib-nekohtml") +// includeBuild("src/plugin/lib-rabbitmq") +// includeBuild("src/plugin/lib-regex-filter") +// includeBuild("src/plugin/lib-selenium") +// includeBuild("src/plugin/lib-xml") +// includeBuild("src/plugin/microformats-reltag") +// includeBuild("src/plugin/mimetype-filter") +// includeBuild("src/plugin/nutch-extensionpoints") +// includeBuild("src/plugin/parse-ext") +// 
includeBuild("src/plugin/parse-html") +// includeBuild("src/plugin/parse-js") +// includeBuild("src/plugin/parse-metatags") +// includeBuild("src/plugin/parse-swf") +// includeBuild("src/plugin/parse-tika") +// includeBuild("src/plugin/parse-zip") +// includeBuild("src/plugin/parsefilter-debug") +// includeBuild("src/plugin/parsefilter-naivebayes") +// includeBuild("src/plugin/parsefilter-regex") +// includeBuild("src/plugin/protocol-file") +// includeBuild("src/plugin/protocol-foo") +// includeBuild("src/plugin/protocol-ftp") +// includeBuild("src/plugin/protocol-htmlunit") +// includeBuild("src/plugin/protocol-http") +// includeBuild("src/plugin/protocol-httpclient") +// includeBuild("src/plugin/protocol-interactiveselenium") +// includeBuild("src/plugin/protocol-okhttp") +// includeBuild("src/plugin/protocol-selenium") +// includeBuild("src/plugin/publish-rabbitmq") +// includeBuild("src/plugin/scoring-depth") +// includeBuild("src/plugin/scoring-link") +// includeBuild("src/plugin/scoring-metadata") +// includeBuild("src/plugin/scoring-opic") +// includeBuild("src/plugin/scoring-orphan") +// includeBuild("src/plugin/scoring-similarity") +// includeBuild("src/plugin/subcollection") +// includeBuild("src/plugin/tld") +// includeBuild("src/plugin/urlfilter-automaton") +// includeBuild("src/plugin/urlfilter-domain") +// includeBuild("src/plugin/urlfilter-domaindenylist") +// includeBuild("src/plugin/urlfilter-fast") +// includeBuild("src/plugin/urlfilter-ignoreexempt") +// includeBuild("src/plugin/urlfilter-prefix") +// includeBuild("src/plugin/urlfilter-regex") +// includeBuild("src/plugin/urlfilter-suffix") +// includeBuild("src/plugin/urlfilter-validator") +// includeBuild("src/plugin/urlmeta") +// includeBuild("src/plugin/urlnormalizer-ajax") +// includeBuild("src/plugin/urlnormalizer-basic") +// includeBuild("src/plugin/urlnormalizer-host") +// includeBuild("src/plugin/urlnormalizer-pass") +// includeBuild("src/plugin/urlnormalizer-protocol") +// 
includeBuild("src/plugin/urlnormalizer-querystring") +// includeBuild("src/plugin/urlnormalizer-regex") +// includeBuild("src/plugin/urlnormalizer-slash") diff --git a/src/java/org/apache/nutch/segment/SegmentMerger.java b/src/java/org/apache/nutch/segment/SegmentMerger.java index 056df3c882..86e61fee28 100644 --- a/src/java/org/apache/nutch/segment/SegmentMerger.java +++ b/src/java/org/apache/nutch/segment/SegmentMerger.java @@ -76,7 +76,7 @@ *

* Also, it's possible to slice the resulting segment into chunks of fixed size. *

- *

Important Notes

Which parts are merged?

+ *

Important Notes

Which parts are merged?

*

* It doesn't make sense to merge data from segments, which are at different * stages of processing (e.g. one unfetched segment, one fetched but not parsed, diff --git a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java index 0a93947e43..b5cf6ae951 100644 --- a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java +++ b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java @@ -42,9 +42,8 @@ * Arc files are essentially tars of gzips. Each record in an arc file is a * compressed gzip. Multiple records are concatenated together to form a * complete arc.

- *

For more information on the arc file format - * @see ArcFileFormat. - *

+ *

For more information on the arc file format

+ * @see ArcFileFormat

.

* *

* Arc files are used by the internet archive and grub projects. diff --git a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java index dd8605f792..64885d7e88 100644 --- a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java +++ b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java @@ -78,7 +78,7 @@ * expressions, it only accepts literal suffixes. I.e. a suffix "+*.jpg" is most * probably wrong, you should use "+.jpg" instead. *

- *

Example 1

+ *

Example 1

*

* The configuration shown below will accept all URLs with '.html' or '.htm' * suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected), and prohibit @@ -96,7 +96,7 @@ * .htm * * - *

Example 2

+ *

Example 2

*

* The configuration shown below will accept all URLs except common graphical * formats.