From b33f27ac7b81c791f1609cee71a42395476fe2cc Mon Sep 17 00:00:00 2001 From: nruest Date: Thu, 30 Sep 2021 13:39:02 -0400 Subject: [PATCH 01/11] Rip out Java code. * Partially address #494 --- README.md | 1 - pom.xml | 44 ---- .../data/ArcRecordUtils.java | 180 --------------- .../data/ArchiveRecordInputFormat.java | 206 ------------------ .../data/ArchiveRecordWritable.java | 147 ------------- .../data/WarcRecordUtils.java | 172 --------------- .../archivesunleashed/data/package-info.java | 17 -- .../archivesunleashed/data/ArcLoaderTest.java | 101 --------- .../data/ArchiveRecordInputFormatTest.java | 166 -------------- .../data/ArchiveRecordWritableTest.java | 143 ------------ .../data/WarcLoaderTest.java | 157 ------------- 11 files changed, 1334 deletions(-) delete mode 100644 src/main/java/io/archivesunleashed/data/ArcRecordUtils.java delete mode 100644 src/main/java/io/archivesunleashed/data/ArchiveRecordInputFormat.java delete mode 100644 src/main/java/io/archivesunleashed/data/ArchiveRecordWritable.java delete mode 100644 src/main/java/io/archivesunleashed/data/WarcRecordUtils.java delete mode 100644 src/main/java/io/archivesunleashed/data/package-info.java delete mode 100644 src/test/java/io/archivesunleashed/data/ArcLoaderTest.java delete mode 100644 src/test/java/io/archivesunleashed/data/ArchiveRecordInputFormatTest.java delete mode 100644 src/test/java/io/archivesunleashed/data/ArchiveRecordWritableTest.java delete mode 100644 src/test/java/io/archivesunleashed/data/WarcLoaderTest.java diff --git a/README.md b/README.md index a6a384f2..151c3766 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # The Archives Unleashed Toolkit [![codecov](https://codecov.io/gh/archivesunleashed/aut/branch/main/graph/badge.svg)](https://codecov.io/gh/archivesunleashed/aut) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/io.archivesunleashed/aut/badge.svg)](https://maven-badges.herokuapp.com/maven-central/io.archivesunleashed/aut) -[![Javadoc](https://javadoc-badge.appspot.com/io.archivesunleashed/aut.svg?label=javadoc)](http://api.docs.archivesunleashed.io/0.90.2/apidocs/index.html) [![Scaladoc](https://javadoc-badge.appspot.com/io.archivesunleashed/aut.svg?label=scaladoc)](http://api.docs.archivesunleashed.io/0.90.2/scaladocs/index.html) [![UserDocs](https://img.shields.io/badge/UserDocs-0.90.2-brightgreen?style=flat)](https://aut.docs.archivesunleashed.org/docs/home) [![LICENSE](https://img.shields.io/badge/license-Apache-blue.svg?style=flat)](https://www.apache.org/licenses/LICENSE-2.0) diff --git a/pom.xml b/pom.xml index 7c96e581..76438390 100644 --- a/pom.xml +++ b/pom.xml @@ -30,7 +30,6 @@ 1.9.5 2.3 0.12 - 3.1.1 1.6 3.0.0 2.8.2 @@ -178,12 +177,9 @@
config/LICENSE_HEADER.txt
- SLASHSTAR_STYLE SLASHSTAR_STYLE - src/main/java/** - src/test/java/** src/main/scala/** src/test/scala/** @@ -267,34 +263,6 @@ - - org.apache.maven.plugins - maven-javadoc-plugin - ${javadoc.plugin.version} - - ${java.home}/bin/javadoc - true - true - 11 - - - - verify-javadocs - - jar - test-jar - - - - attach-javadocs - - jar - javadoc - - site - - - maven-changelog-plugin ${changelog.plugin.version} @@ -357,13 +325,6 @@ --illegal-access=permit
- - maven-javadoc-plugin - ${javadoc.plugin.version} - - true - - maven-jxr-plugin ${jxr.plugin.version} @@ -597,11 +558,6 @@ - - org.apache.maven.plugins - maven-javadoc-plugin - ${javadoc.plugin.version} - maven-gpg-plugin ${gpg.plugin.version} diff --git a/src/main/java/io/archivesunleashed/data/ArcRecordUtils.java b/src/main/java/io/archivesunleashed/data/ArcRecordUtils.java deleted file mode 100644 index 6b30690c..00000000 --- a/src/main/java/io/archivesunleashed/data/ArcRecordUtils.java +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright © 2017 The Archives Unleashed Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.archivesunleashed.data; - -import java.io.BufferedInputStream; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.DataOutputStream; -import java.io.IOException; -import java.io.InputStream; -import org.apache.commons.io.IOUtils; -import org.apache.commons.io.input.BoundedInputStream; -import org.apache.log4j.Logger; -import org.archive.io.arc.ARCReader; -import org.archive.io.arc.ARCReaderFactory; -import org.archive.io.arc.ARCRecord; -import org.archive.io.arc.ARCRecordMetaData; - -/** Utilities for working with {@code ARCRecord}s (from archive.org APIs). */ -public final class ArcRecordUtils { - - /** Utility classes should not have a public or default constructor. */ - private ArcRecordUtils() {} - - /** Setup logger. */ - private static final Logger LOG = Logger.getLogger(ArcRecordUtils.class); - - /** - * Converts raw bytes into an {@code ARCRecord}. - * - * @param bytes raw bytes - * @return parsed {@code ARCRecord} - * @throws IOException if there is an issue - */ - public static ARCRecord fromBytes(final byte[] bytes) throws IOException { - ARCReader reader = - (ARCReader) - ARCReaderFactory.get( - "", new BufferedInputStream(new ByteArrayInputStream(bytes)), false); - return (ARCRecord) reader.get(); - } - - /** - * Converts ARC record into raw bytes. 
- * - * @param record conents of WARC response record - * @return raw contents - * @throws IOException if there is an issue - */ - public static byte[] toBytes(final ARCRecord record) throws IOException { - ARCRecordMetaData meta = record.getMetaData(); - - String metaline = - meta.getUrl() - + " " - + meta.getIp() - + " " - + meta.getDate() - + " " - + meta.getMimetype() - + " " - + (int) meta.getLength(); - String versionEtc = ""; - - if (meta.getOffset() == 0) { - versionEtc = - "\n" - + meta.getVersion().replace(".", " ") - + " " - + meta.getOrigin() - + "\n" - + "URL IP-address Archive-date Content-type Archive-length"; - metaline += versionEtc; - } - - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutputStream dout = new DataOutputStream(baos); - dout.write(metaline.getBytes()); - dout.write("\n".getBytes()); - - long recordLength = meta.getLength() - versionEtc.length(); - long len = IOUtils.copyLarge(new BoundedInputStream(record, recordLength), dout); - if (len != recordLength) { - LOG.error("Read " + len + " bytes but expected " + recordLength + " bytes. Continuing..."); - } - return baos.toByteArray(); - } - - /** - * Extracts raw contents from an {@code ARCRecord} (including HTTP headers). - * - * @param record the {@code ARCRecord} - * @return raw contents - * @throws IOException if there is an issue - */ - public static byte[] getContent(final ARCRecord record) throws IOException { - ARCRecordMetaData meta = record.getMetaData(); - String versionEtc = ""; - - if (meta.getOffset() == 0) { - versionEtc = - "\n" - + meta.getVersion().replace(".", " ") - + " " - + meta.getOrigin() - + "\n" - + "URL IP-address Archive-date Content-type Archive-length"; - } - - try { - return copyToByteArray(record, (int) meta.getLength() - versionEtc.length(), true); - } catch (Exception e) { - // Catch exceptions related to any corrupt archive files. - return new byte[0]; - } - } - - /** - * Extracts contents of the body from an {@code ARCRecord}. Excludes HTTP headers. - * - * @param record the {@code ARCRecord} - * @return contents of the body - * @throws IOException if there is an issue - */ - public static byte[] getBodyContent(final ARCRecord record) throws IOException { - byte[] raw = getContent(record); - int bodyOffset = record.getBodyOffset(); - - byte[] content = null; - try { - content = new byte[raw.length - bodyOffset]; - System.arraycopy(raw, bodyOffset, content, 0, content.length); - } catch (java.lang.NegativeArraySizeException e) { - // To find out what URL causing the error: record.getMetaData().getUrl() - // For some records, we're missing the actual content data, likely due - // to a crawler gitch. Nothing much we can do, just swallow and move on. - content = new byte[0]; - } - return content; - } - - /** - * Copies contents to a byte array. - * - * @param is raw input stream - * @param recordLength is length of a record - * @param enforceLength enforce the length - * @return rawContents of body - * @throws IOException if there is an issue - */ - private static byte[] copyToByteArray( - final InputStream is, final int recordLength, final boolean enforceLength) - throws IOException { - - BoundedInputStream bis = new BoundedInputStream(is, recordLength); - byte[] rawContents = IOUtils.toByteArray(bis); - if (enforceLength && rawContents.length != recordLength) { - LOG.error( - "Read " - + rawContents.length - + " bytes but expected " - + recordLength - + " bytes. 
Continuing..."); - } - return rawContents; - } -} diff --git a/src/main/java/io/archivesunleashed/data/ArchiveRecordInputFormat.java b/src/main/java/io/archivesunleashed/data/ArchiveRecordInputFormat.java deleted file mode 100644 index 0639dc83..00000000 --- a/src/main/java/io/archivesunleashed/data/ArchiveRecordInputFormat.java +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Copyright © 2017 The Archives Unleashed Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.archivesunleashed.data; - -import io.archivesunleashed.data.ArchiveRecordWritable.ArchiveFormat; -import java.io.BufferedInputStream; -import java.io.IOException; -import java.util.Iterator; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.Seekable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.mapreduce.InputSplit; -import org.apache.hadoop.mapreduce.JobContext; -import org.apache.hadoop.mapreduce.RecordReader; -import org.apache.hadoop.mapreduce.TaskAttemptContext; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.FileSplit; -import org.apache.log4j.Logger; -import org.archive.io.ArchiveReader; -import org.archive.io.ArchiveReaderFactory; -import org.archive.io.ArchiveRecord; -import org.archive.io.arc.ARCReader; -import org.archive.io.arc.ARCReaderFactory.CompressedARCReader; -import org.archive.io.warc.WARCReader; -import org.archive.io.warc.WARCReaderFactory.CompressedWARCReader; - -/** Extends FileInputFormat for Web Archive Commons InputFormat. */ -public class ArchiveRecordInputFormat extends FileInputFormat { - /** Setup logger. */ - private static final Logger LOG = Logger.getLogger(ArchiveRecordInputFormat.class); - - @Override - public final RecordReader createRecordReader( - final InputSplit split, final TaskAttemptContext context) - throws IOException, InterruptedException { - return new ArchiveRecordReader(); - } - - @Override - protected final boolean isSplitable(final JobContext context, final Path filename) { - return false; - } - - /** Extends RecordReader for Record Reader. */ - public class ArchiveRecordReader extends RecordReader { - - /** Archive reader. */ - private ArchiveReader reader; - - /** Archive format. */ - private ArchiveFormat format; - - /** Start position of archive being read. */ - private long start; - - /** A given position of an archive being read. */ - private long pos; - - /** End position of an archive being read. */ - private long end; - - /** LongWritable key. */ - private LongWritable key = null; - - /** ArchiveRecordWritable value. */ - private ArchiveRecordWritable value = null; - - /** Archive file name. */ - private String fileName; - - /** Seekable file position. */ - private Seekable filePosition; - - /** Iterator for an archive record. 
*/ - private Iterator iter; - - @Override - public final void initialize( - final InputSplit archiveRecordSplit, final TaskAttemptContext context) throws IOException { - FileSplit split = (FileSplit) archiveRecordSplit; - Configuration job = context.getConfiguration(); - start = split.getStart(); - end = start + split.getLength(); - final Path file = split.getPath(); - - FileSystem fs = file.getFileSystem(job); - FSDataInputStream fileIn = fs.open(split.getPath()); - fileName = split.getPath().toString(); - - reader = ArchiveReaderFactory.get(fileName, new BufferedInputStream(fileIn), true); - - if (reader instanceof ARCReader) { - format = ArchiveFormat.ARC; - iter = reader.iterator(); - } - - if (reader instanceof WARCReader) { - format = ArchiveFormat.WARC; - iter = reader.iterator(); - } - - this.pos = start; - } - - /** - * Determines if archive is compressed. - * - * @return instanceof if ARC/WARC - */ - private boolean isCompressedInput() { - if (format == ArchiveFormat.ARC) { - return reader instanceof CompressedARCReader; - } else { - return reader instanceof CompressedWARCReader; - } - } - - /** - * Get file position of archive. - * - * @return retVal position of archive - * @throws IOException if there is an issue - */ - private long getFilePosition() throws IOException { - long retVal; - if (isCompressedInput() && null != filePosition) { - retVal = filePosition.getPos(); - } else { - retVal = pos; - } - return retVal; - } - - @Override - public final boolean nextKeyValue() throws IOException { - if (!iter.hasNext()) { - return false; - } - - if (key == null) { - key = new LongWritable(); - } - key.set(pos); - - ArchiveRecord record = null; - try { - record = iter.next(); - } catch (Exception e) { - return false; - } - - if (record == null) { - return false; - } - - if (value == null) { - value = new ArchiveRecordWritable(); - } - value.setRecord(record); - - return true; - } - - @Override - public final LongWritable getCurrentKey() { - return key; - } - - @Override - public final ArchiveRecordWritable getCurrentValue() { - return value; - } - - @Override - public final float getProgress() throws IOException { - if (start == end) { - return 0.0f; - } else { - return Math.min(1.0f, (getFilePosition() - start) / (float) (end - start)); - } - } - - @Override - public final synchronized void close() throws IOException { - reader.close(); - LOG.info("Closed archive file " + fileName); - } - } -} diff --git a/src/main/java/io/archivesunleashed/data/ArchiveRecordWritable.java b/src/main/java/io/archivesunleashed/data/ArchiveRecordWritable.java deleted file mode 100644 index 986cc911..00000000 --- a/src/main/java/io/archivesunleashed/data/ArchiveRecordWritable.java +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright © 2017 The Archives Unleashed Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.archivesunleashed.data; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import org.apache.hadoop.io.Writable; -import org.archive.io.ArchiveRecord; -import org.archive.io.arc.ARCRecord; -import org.archive.io.warc.WARCRecord; - -/** Implements Hadoop Writable for Archive Records. */ -public class ArchiveRecordWritable implements Writable { - - /** Archive Formats that can be used. {@link #UNKNOWN} {@link #ARC} {@link #WARC} */ - public enum ArchiveFormat { - /** UNKNOWN format. */ - UNKNOWN, - - /** ARC format. */ - ARC, - - /** WARC format. */ - WARC - } - - /** Set default Record format to UNKNOWN. */ - private ArchiveFormat format = ArchiveFormat.UNKNOWN; - - /** Initialize Archive Record to null. */ - private ArchiveRecord record = null; - - /** Utility function. */ - public ArchiveRecordWritable() {} - - /** - * Initialize Archive Record. - * - * @param r Archive Record - */ - public ArchiveRecordWritable(final ArchiveRecord r) { - this.record = r; - detectFormat(); - } - - /** - * Set Archive Record. - * - * @param r Archive Record - */ - public final void setRecord(final ArchiveRecord r) { - this.record = r; - detectFormat(); - } - - /** - * Get Archive Record. - * - * @return record Archive Record - */ - public final ArchiveRecord getRecord() { - return record; - } - - /** Detect format of Archive Record. */ - public final void detectFormat() { - if (record instanceof ARCRecord) { - format = ArchiveFormat.ARC; - } else if (record instanceof WARCRecord) { - format = ArchiveFormat.WARC; - } else { - format = ArchiveFormat.UNKNOWN; - } - } - - /** - * Get format of Archive Record. - * - * @return format of Archive Record - */ - public final ArchiveFormat getFormat() { - return format; - } - - /** - * Set format of Archive Record. - * - * @param f format of Archive Record - */ - public final void setFormat(final ArchiveFormat f) { - this.format = f; - } - - @Override - public final void readFields(final DataInput in) throws IOException { - int len = in.readInt(); - if (len == 0) { - this.record = null; - return; - } - - byte[] bytes = new byte[len]; - in.readFully(bytes); - - if (getFormat() == ArchiveFormat.ARC) { - this.record = ArcRecordUtils.fromBytes(bytes); - } else if (getFormat() == ArchiveFormat.WARC) { - this.record = WarcRecordUtils.fromBytes(bytes); - } else { - this.record = null; - } - } - - @Override - public final void write(final DataOutput out) throws IOException { - if (record == null) { - out.writeInt(0); - } - byte[] bytes; - - if (getFormat() == ArchiveFormat.ARC) { - bytes = ArcRecordUtils.toBytes((ARCRecord) record); - } else if (getFormat() == ArchiveFormat.WARC) { - bytes = WarcRecordUtils.toBytes((WARCRecord) record); - } else { - bytes = null; - } - - out.writeInt(bytes.length); - out.write(bytes); - } -} diff --git a/src/main/java/io/archivesunleashed/data/WarcRecordUtils.java b/src/main/java/io/archivesunleashed/data/WarcRecordUtils.java deleted file mode 100644 index dfa4b830..00000000 --- a/src/main/java/io/archivesunleashed/data/WarcRecordUtils.java +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright © 2017 The Archives Unleashed Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.archivesunleashed.data; - -import java.io.BufferedInputStream; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.DataOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import org.apache.commons.httpclient.HttpParser; -import org.apache.commons.io.IOUtils; -import org.apache.commons.io.input.BoundedInputStream; -import org.apache.log4j.Logger; -import org.archive.format.warc.WARCConstants; -import org.archive.io.warc.WARCReader; -import org.archive.io.warc.WARCReaderFactory; -import org.archive.io.warc.WARCRecord; - -/** Utilities for working with {@code WARCRecord}s (from archive.org APIs). */ -public final class WarcRecordUtils implements WARCConstants { - - /** Utility classes should not have a public or default constructor. */ - private WarcRecordUtils() {} - - /** Setup logger. */ - private static final Logger LOG = Logger.getLogger(WarcRecordUtils.class); - - /** - * Converts raw bytes into an {@code WARCRecord}. - * - * @param bytes raw bytes - * @return parsed {@code WARCRecord} - * @throws IOException if there is an issue - */ - public static WARCRecord fromBytes(final byte[] bytes) throws IOException { - WARCReader reader = - (WARCReader) - WARCReaderFactory.get( - "", new BufferedInputStream(new ByteArrayInputStream(bytes)), false); - return (WARCRecord) reader.get(); - } - - /** - * Converts WARC record into raw bytes. - * - * @param record conents of WARC response record - * @return raw contents - * @throws IOException if there is an issue - */ - public static byte[] toBytes(final WARCRecord record) throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutputStream dout = new DataOutputStream(baos); - - dout.write("WARC/0.17\n".getBytes()); - for (Map.Entry entry : record.getHeader().getHeaderFields().entrySet()) { - dout.write((entry.getKey() + ": " + entry.getValue().toString() + "\n").getBytes()); - } - dout.write("\n".getBytes()); - record.dump(dout); - - return baos.toByteArray(); - } - - /** - * Extracts the MIME type of WARC response records. "WARC-Type" is "response". Note that this is - * different from the "Content-Type" in the WARC header. - * - * @param contents raw contents of the WARC response record - * @return MIME type - */ - public static String getWarcResponseMimeType(final byte[] contents) { - // This is a somewhat janky way to get the MIME type of the response. - // Moreover, this simple regex is not compliant with the specification. - // See: https://www.w3.org/Protocols/rfc1341/4_Content-Type.html - // It would be much better to parse all headers using an external library: - // org.apache.commons.httpclient.HeaderElement - // Note that this is different from the "Content-Type" in the WARC header. 
- Pattern pattern = Pattern.compile("Content-Type: ([^\\s;]+) *(;.*)?", Pattern.CASE_INSENSITIVE); - Matcher matcher = pattern.matcher(new String(contents)); - if (matcher.find()) { - return matcher.group(1).replaceAll(";$", ""); - } - - return null; - } - - /** - * Extracts raw contents from a {@code WARCRecord} (including HTTP headers). - * - * @param record the {@code WARCRecord} - * @return raw contents - * @throws IOException if there is an issue - */ - public static byte[] getContent(final WARCRecord record) throws IOException { - int len = (int) record.getHeader().getContentLength(); - - // If we have a corrupt record, quit and move on. - if (len < 0) { - return new byte[0]; - } - - try { - return copyToByteArray(record, len, true); - } catch (Exception e) { - // Catch exceptions related to any corrupt archive files. - return new byte[0]; - } - } - - /** - * Extracts contents of the body from a {@code WARCRecord}. Excludes HTTP headers. - * - * @param record the {@code WARCRecord} - * @return contents of the body - * @throws IOException if there is an issue - */ - public static byte[] getBodyContent(final WARCRecord record) throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - String line = HttpParser.readLine(record, WARC_HEADER_ENCODING); - if (line == null) { - return null; - } - - // Just using parseHeaders to move down input stream to body. - HttpParser.parseHeaders(record, WARC_HEADER_ENCODING); - record.dump(baos); - return baos.toByteArray(); - } - - /** - * Copies contents to a byte array. - * - * @param is raw input stream - * @param recordLength length of a record - * @param enforceLength enforce the length - * @return rawContents of body - * @throws IOException if there is an issue - */ - private static byte[] copyToByteArray( - final InputStream is, final int recordLength, final boolean enforceLength) - throws IOException { - - BoundedInputStream bis = new BoundedInputStream(is, recordLength); - byte[] rawContents = IOUtils.toByteArray(bis); - if (enforceLength && rawContents.length != recordLength) { - LOG.error( - "Read " - + rawContents.length - + " bytes but expected " - + recordLength - + " bytes. Continuing..."); - } - return rawContents; - } -} diff --git a/src/main/java/io/archivesunleashed/data/package-info.java b/src/main/java/io/archivesunleashed/data/package-info.java deleted file mode 100644 index 13a4b2cc..00000000 --- a/src/main/java/io/archivesunleashed/data/package-info.java +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright © 2017 The Archives Unleashed Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** This package provides various data utilities for analyzing WARCs. 
*/ -package io.archivesunleashed.data; diff --git a/src/test/java/io/archivesunleashed/data/ArcLoaderTest.java b/src/test/java/io/archivesunleashed/data/ArcLoaderTest.java deleted file mode 100644 index d553a1ed..00000000 --- a/src/test/java/io/archivesunleashed/data/ArcLoaderTest.java +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright © 2017 The Archives Unleashed Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.archivesunleashed.data; - -import static org.junit.Assert.assertEquals; - -import com.google.common.io.Resources; -import java.io.BufferedInputStream; -import java.io.ByteArrayInputStream; -import java.io.DataInputStream; -import java.io.File; -import java.io.InputStream; -import java.util.Iterator; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.archive.io.ArchiveRecord; -import org.archive.io.arc.ARCReader; -import org.archive.io.arc.ARCReaderFactory; -import org.archive.io.arc.ARCRecord; -import org.archive.io.arc.ARCRecordMetaData; -import org.junit.Test; - -public class ArcLoaderTest { - private static final Log LOG = LogFactory.getLog(ArcLoaderTest.class); - - @Test - public final void testReader() throws Exception { - String[] urls = - new String[] { - "filedesc://IAH-20080430204825-00000-blackbook.arc", - "dns:www.archive.org", - "http://www.archive.org/robots.txt", - "http://www.archive.org/", - "http://www.archive.org/index.php" - }; - - String arcFile = Resources.getResource("arc/example.arc.gz").getPath(); - ARCReader reader = ARCReaderFactory.get(new File(arcFile)); - - int cnt = 0; - final int cntTest = 300; - - for (Iterator ii = reader.iterator(); ii.hasNext(); ) { - ARCRecord r = (ARCRecord) ii.next(); - ARCRecordMetaData meta = r.getMetaData(); - - if (cnt < urls.length) { - assertEquals(urls[cnt], meta.getUrl()); - } - cnt++; - } - reader.close(); - - LOG.info(cnt + " records read!"); - assertEquals(cntTest, cnt); - } - - @Test - public final void testReadFromStream() throws Exception { - String arcFile = Resources.getResource("arc/example.arc.gz").getPath(); - ARCReader reader = ARCReaderFactory.get(new File(arcFile)); - - int cnt = 0; - final int cntTest = 300; - - for (Iterator ii = reader.iterator(); ii.hasNext(); ) { - ARCRecord r = (ARCRecord) ii.next(); - // Skip the file header. 
- if (cnt == 0) { - cnt++; - continue; - } - - String h = r.getHeaderString(); - InputStream in = new DataInputStream(new ByteArrayInputStream(ArcRecordUtils.toBytes(r))); - - ARCReader nr = (ARCReader) ARCReaderFactory.get("", new BufferedInputStream(in), false); - ARCRecord r2 = (ARCRecord) nr.get(); - - assertEquals(h, r2.getHeaderString()); - cnt++; - } - reader.close(); - - LOG.info(cnt + " records read!"); - assertEquals(cntTest, cnt); - } -} diff --git a/src/test/java/io/archivesunleashed/data/ArchiveRecordInputFormatTest.java b/src/test/java/io/archivesunleashed/data/ArchiveRecordInputFormatTest.java deleted file mode 100644 index f8c8f081..00000000 --- a/src/test/java/io/archivesunleashed/data/ArchiveRecordInputFormatTest.java +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright © 2017 The Archives Unleashed Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.archivesunleashed.data; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import com.google.common.io.Resources; -import java.io.File; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.mapreduce.InputFormat; -import org.apache.hadoop.mapreduce.RecordReader; -import org.apache.hadoop.mapreduce.TaskAttemptContext; -import org.apache.hadoop.mapreduce.TaskAttemptID; -import org.apache.hadoop.mapreduce.lib.input.FileSplit; -import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; -import org.apache.hadoop.util.ReflectionUtils; -import org.archive.io.ArchiveRecord; -import org.archive.io.arc.ARCRecord; -import org.archive.io.arc.ARCRecordMetaData; -import org.archive.io.warc.WARCRecord; -import org.junit.Test; - -public class ArchiveRecordInputFormatTest { - @Test - public final void testArcInputFormat() throws Exception { - String[] urls = - new String[] { - "filedesc://IAH-20080430204825-00000-blackbook.arc", - "dns:www.archive.org", - "http://www.archive.org/robots.txt", - "http://www.archive.org/", - "http://www.archive.org/index.php" - }; - - String arcFile = Resources.getResource("arc/example.arc.gz").getPath(); - - Configuration conf = new Configuration(false); - conf.set("fs.defaultFS", "file:///"); - - File testFile = new File(arcFile); - Path path = new Path(testFile.getAbsoluteFile().toURI()); - FileSplit split = new FileSplit(path, 0, testFile.length(), null); - - InputFormat inputFormat = - ReflectionUtils.newInstance(ArchiveRecordInputFormat.class, conf); - TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); - RecordReader reader = - inputFormat.createRecordReader(split, context); - - reader.initialize(split, context); - - int cnt = 0; - final int cntTest = 300; - - while (reader.nextKeyValue()) { - ArchiveRecord record = reader.getCurrentValue().getRecord(); - boolean isArc = record instanceof ARCRecord; - assertTrue(isArc); - - if (isArc) { - ARCRecord arcRecord = (ARCRecord) record; - ARCRecordMetaData 
metadata = arcRecord.getMetaData(); - - if (cnt < urls.length) { - assertEquals(urls[cnt], metadata.getUrl()); - } - } - - cnt++; - } - assertEquals(cntTest, cnt); - } - - @Test - public final void testWarcInputFormat() throws Exception { - String[] urls = - new String[] { - null, - "dns:www.archive.org", - "http://www.archive.org/robots.txt", - "http://www.archive.org/robots.txt", - "http://www.archive.org/robots.txt", - "http://www.archive.org/", - "http://www.archive.org/", - "http://www.archive.org/", - "http://www.archive.org/index.php", - "http://www.archive.org/index.php" - }; - - String[] types = - new String[] { - "warcinfo", - "response", - "response", - "request", - "metadata", - "response", - "request", - "metadata", - "response", - "request" - }; - - String arcFile = Resources.getResource("warc/example.warc.gz").getPath(); - - Configuration conf = new Configuration(false); - conf.set("fs.defaultFS", "file:///"); - - File testFile = new File(arcFile); - Path path = new Path(testFile.getAbsoluteFile().toURI()); - FileSplit split = new FileSplit(path, 0, testFile.length(), null); - - InputFormat inputFormat = - ReflectionUtils.newInstance(ArchiveRecordInputFormat.class, conf); - TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); - RecordReader reader = - inputFormat.createRecordReader(split, context); - - reader.initialize(split, context); - - assertTrue(urls.length == types.length); - - int cnt = 0; - int responseCnt = 0; - final int cntTest = 822; - final int responseCntTest = 299; - - while (reader.nextKeyValue()) { - ArchiveRecord record = reader.getCurrentValue().getRecord(); - boolean isWarc = record instanceof WARCRecord; - assertTrue(isWarc); - - if (isWarc) { - WARCRecord warcRecord = (WARCRecord) record; - if (cnt < urls.length) { - assertEquals(urls[cnt], warcRecord.getHeader().getUrl()); - assertEquals(types[cnt], warcRecord.getHeader().getHeaderValue("WARC-Type")); - } - - if (warcRecord.getHeader().getHeaderValue("WARC-Type").equals("response")) { - responseCnt++; - } - } - - cnt++; - } - assertEquals(cntTest, cnt); - assertEquals(responseCntTest, responseCnt); - } -} diff --git a/src/test/java/io/archivesunleashed/data/ArchiveRecordWritableTest.java b/src/test/java/io/archivesunleashed/data/ArchiveRecordWritableTest.java deleted file mode 100644 index d8c65f96..00000000 --- a/src/test/java/io/archivesunleashed/data/ArchiveRecordWritableTest.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright © 2017 The Archives Unleashed Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.archivesunleashed.data; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import com.google.common.io.Resources; -import io.archivesunleashed.data.ArchiveRecordWritable.ArchiveFormat; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.File; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.mapreduce.InputFormat; -import org.apache.hadoop.mapreduce.RecordReader; -import org.apache.hadoop.mapreduce.TaskAttemptContext; -import org.apache.hadoop.mapreduce.TaskAttemptID; -import org.apache.hadoop.mapreduce.lib.input.FileSplit; -import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; -import org.apache.hadoop.util.ReflectionUtils; -import org.archive.io.arc.ARCRecord; -import org.junit.Test; - -public class ArchiveRecordWritableTest { - @Test - public final void testArcInputFormat() throws Exception { - String arcFile = Resources.getResource("arc/example.arc.gz").getPath(); - - Configuration conf = new Configuration(false); - conf.set("fs.defaultFS", "file:///"); - - File testFile = new File(arcFile); - Path path = new Path(testFile.getAbsoluteFile().toURI()); - FileSplit split = new FileSplit(path, 0, testFile.length(), null); - - InputFormat inputFormat = - ReflectionUtils.newInstance(ArchiveRecordInputFormat.class, conf); - TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); - RecordReader reader = - inputFormat.createRecordReader(split, context); - - reader.initialize(split, context); - - int cnt = 0; - final int cntTest = 300; - - while (reader.nextKeyValue()) { - ArchiveRecordWritable record = reader.getCurrentValue(); - cnt++; - - ByteArrayOutputStream bytesOut = new ByteArrayOutputStream(); - DataOutputStream dataOut = new DataOutputStream(bytesOut); - - record.write(dataOut); - - ArchiveRecordWritable reconstructed = new ArchiveRecordWritable(); - - reconstructed.setFormat(ArchiveFormat.ARC); - reconstructed.readFields( - new DataInputStream(new ByteArrayInputStream(bytesOut.toByteArray()))); - - boolean isArc = (record.getFormat() == ArchiveFormat.ARC); - assertEquals(isArc, true); - if (isArc) { - assertEquals( - ((ARCRecord) record.getRecord()).getMetaData().getUrl(), - ((ARCRecord) reconstructed.getRecord()).getMetaData().getUrl()); - } - } - - assertEquals(cntTest, cnt); - } - - @Test - public final void testWarcInputFormat() throws Exception { - String warcFile = Resources.getResource("warc/example.warc.gz").getPath(); - - Configuration conf = new Configuration(false); - conf.set("fs.defaultFS", "file:///"); - - File testFile = new File(warcFile); - Path path = new Path(testFile.getAbsoluteFile().toURI()); - FileSplit split = new FileSplit(path, 0, testFile.length(), null); - - InputFormat inputFormat = - ReflectionUtils.newInstance(ArchiveRecordInputFormat.class, conf); - TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); - RecordReader reader = - inputFormat.createRecordReader(split, context); - - reader.initialize(split, context); - - int cnt = 0; - final int cntTest = 822; - - while (reader.nextKeyValue()) { - ArchiveRecordWritable record = reader.getCurrentValue(); - - cnt++; - - ByteArrayOutputStream bytesOut = new ByteArrayOutputStream(); - DataOutputStream dataOut = new DataOutputStream(bytesOut); - - record.write(dataOut); - - 
ArchiveRecordWritable reconstructed = new ArchiveRecordWritable(); - - reconstructed.setFormat(ArchiveFormat.WARC); - reconstructed.readFields( - new DataInputStream(new ByteArrayInputStream(bytesOut.toByteArray()))); - - boolean isWarc = (record.getFormat() == ArchiveFormat.WARC); - assertTrue(isWarc); - if (isWarc) { - assertEquals( - record.getRecord().getHeader().getUrl(), - reconstructed.getRecord().getHeader().getUrl()); - assertEquals( - record.getRecord().getHeader().getContentLength(), - reconstructed.getRecord().getHeader().getContentLength()); - } - } - - assertEquals(cntTest, cnt); - } -} diff --git a/src/test/java/io/archivesunleashed/data/WarcLoaderTest.java b/src/test/java/io/archivesunleashed/data/WarcLoaderTest.java deleted file mode 100644 index b0fdf28d..00000000 --- a/src/test/java/io/archivesunleashed/data/WarcLoaderTest.java +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright © 2017 The Archives Unleashed Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package io.archivesunleashed.data; - -import static org.junit.Assert.assertEquals; - -import com.google.common.io.Resources; -import java.io.BufferedInputStream; -import java.io.ByteArrayInputStream; -import java.io.DataInputStream; -import java.io.File; -import java.io.InputStream; -import java.text.SimpleDateFormat; -import java.util.Iterator; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.archive.io.ArchiveRecord; -import org.archive.io.ArchiveRecordHeader; -import org.archive.io.warc.WARCReader; -import org.archive.io.warc.WARCReaderFactory; -import org.archive.io.warc.WARCRecord; -import org.archive.util.ArchiveUtils; -import org.junit.Test; -import tl.lin.data.fd.Object2IntFrequencyDistribution; -import tl.lin.data.fd.Object2IntFrequencyDistributionEntry; - -public class WarcLoaderTest { - private static final Log LOG = LogFactory.getLog(WarcLoaderTest.class); - private static final SimpleDateFormat DATE_WARC = - new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); - - @Test - public final void testReader() throws Exception { - String warcFile = Resources.getResource("warc/example.warc.gz").getPath(); - WARCReader reader = WARCReaderFactory.get(new File(warcFile)); - - Object2IntFrequencyDistribution types = - new Object2IntFrequencyDistributionEntry(); - - Object2IntFrequencyDistribution responseTypes = - new Object2IntFrequencyDistributionEntry(); - - int cnt = 0; - final int cntTest = 822; - final int responseTest = 299; - final int warcinfoTest = 1; - final int requestTest = 261; - final int metadataTest = 261; - final int numberOfEventsTest = 4; - final int typesSumOfCountsTest = 822; - final int mimeTypeJavascriptTest = 8; - final int mimeTypeCssTest = 4; - final int mimeTypeFlashTest = 8; - final int mimeTypeXmlTest = 9; - final int mimeTypePngTest = 8; - final int mimeTypeJpegTest = 18; - final int mimeTypeGifTest = 29; - final int mimeTypePlainTest = 36; - final int mimeTypeHtmlTest = 140; - final int responseSumOfCountsTest = 260; - - 
for (Iterator ii = reader.iterator(); ii.hasNext(); ) { - WARCRecord r = (WARCRecord) ii.next(); - ArchiveRecordHeader header = r.getHeader(); - - types.increment((String) header.getHeaderValue("WARC-Type")); - - byte[] contents = WarcRecordUtils.getContent(r); - int len = (int) header.getContentLength(); - assertEquals(len, contents.length); - - // This is how you extract the date - @SuppressWarnings("unused") - String digit14Date = ArchiveUtils.get14DigitDate(DATE_WARC.parse(header.getDate())); - - if (header.getHeaderValue("WARC-Type").equals("response") - && header.getUrl().startsWith("http://")) { - responseTypes.increment(WarcRecordUtils.getWarcResponseMimeType(contents)); - } - - cnt++; - } - reader.close(); - - LOG.info(cnt + " records read!"); - assertEquals(cntTest, cnt); - - assertEquals(responseTest, types.get("response")); - assertEquals(warcinfoTest, types.get("warcinfo")); - assertEquals(requestTest, types.get("request")); - assertEquals(metadataTest, types.get("metadata")); - assertEquals(numberOfEventsTest, types.getNumberOfEvents()); - assertEquals(typesSumOfCountsTest, types.getSumOfCounts()); - - assertEquals(mimeTypeJavascriptTest, responseTypes.get("application/x-javascript")); - assertEquals(mimeTypeCssTest, responseTypes.get("text/css")); - assertEquals(mimeTypeFlashTest, responseTypes.get("application/x-shockwave-flash")); - assertEquals(mimeTypeXmlTest, responseTypes.get("text/xml")); - assertEquals(mimeTypePngTest, responseTypes.get("image/png")); - assertEquals(mimeTypeJpegTest, responseTypes.get("image/jpeg")); - assertEquals(mimeTypeGifTest, responseTypes.get("image/gif")); - assertEquals(mimeTypePlainTest, responseTypes.get("text/plain")); - assertEquals(mimeTypeHtmlTest, responseTypes.get("text/html")); - assertEquals(responseSumOfCountsTest, responseTypes.getSumOfCounts()); - } - - @Test - public final void testReadFromStream() throws Exception { - String warcFile = Resources.getResource("warc/example.warc.gz").getPath(); - WARCReader reader = WARCReaderFactory.get(new File(warcFile)); - - int cnt = 0; - final int cntTest = 822; - - for (Iterator ii = reader.iterator(); ii.hasNext(); ) { - WARCRecord r = (WARCRecord) ii.next(); - InputStream in = new DataInputStream(new ByteArrayInputStream(WarcRecordUtils.toBytes(r))); - - WARCReader nr = (WARCReader) WARCReaderFactory.get("", new BufferedInputStream(in), false); - WARCRecord r2 = (WARCRecord) nr.get(); - - assertEquals(r.getHeader().getUrl(), r2.getHeader().getUrl()); - - ArchiveRecordHeader header = r2.getHeader(); - byte[] contents = WarcRecordUtils.getContent(r2); - int len = (int) header.getContentLength(); - assertEquals(len, contents.length); - - cnt++; - } - reader.close(); - - LOG.info(cnt + " records read!"); - assertEquals(cntTest, cnt); - } - - @Test - public final void testContentTypeWithCharset() throws Exception { - String content = "Content-Type: text/html;charset=ISO-8859-1\r\n"; - byte[] contentBytes = content.getBytes("UTF-8"); - String mimeType = WarcRecordUtils.getWarcResponseMimeType(contentBytes); - assertEquals("text/html", mimeType); - } -} From 6c1b9fa5966a8ad6aa79f3e58252615a36bc6387 Mon Sep 17 00:00:00 2001 From: nruest Date: Wed, 11 May 2022 15:16:03 -0400 Subject: [PATCH 02/11] Pull in Sparkling. 
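Sparkling is the Internet Archive's Spark library for temporal web data
processing; the com.github.internetarchive:Sparkling:main-SNAPSHOT
coordinate added here follows the JitPack naming convention, so the JitPack
repository is assumed to be available on the build's resolver path.

As a minimal smoke test for the new dependency, here is a sketch that uses
only the Sparkling calls the later patches in this series rely on
(WarcLoader.load and WarcRecord.url); the example file name is a
placeholder assumption:

    import java.io.{BufferedInputStream, FileInputStream}
    import org.archive.webservices.sparkling.warc.WarcLoader

    object SparklingSmokeTest {
      def main(args: Array[String]): Unit = {
        // WarcLoader.load presumably handles decompression itself, since
        // the loadArchives rewrite below opens files with decompress = false.
        val in = new BufferedInputStream(new FileInputStream("example.warc.gz"))
        try {
          // Iterate the WARC records and print the first five target URLs.
          WarcLoader.load(in).take(5).foreach { record =>
            println(record.url.getOrElse("(no URL)"))
          }
        } finally {
          in.close()
        }
      }
    }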
--- pom.xml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pom.xml b/pom.xml index e423c2be..1bd0ede6 100644 --- a/pom.xml +++ b/pom.xml @@ -537,6 +537,11 @@ hadoop-aws ${hadoop.version} + + com.github.internetarchive + Sparkling + main-SNAPSHOT + From 20e04c5ee5da5941767f2df8b14cb55befc6808b Mon Sep 17 00:00:00 2001 From: Helge Holzmann Date: Mon, 16 May 2022 14:16:34 +0200 Subject: [PATCH 03/11] first draft to use IA's Sparkling WARC record loader --- .../SparklingArchiveRecord.scala | 51 +++++++++++++ .../scala/io/archivesunleashed/package.scala | 75 +++++++------------ 2 files changed, 77 insertions(+), 49 deletions(-) create mode 100644 src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala diff --git a/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala b/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala new file mode 100644 index 00000000..8815aaec --- /dev/null +++ b/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala @@ -0,0 +1,51 @@ +package io.archivesunleashed + +import java.io.InputStream + +import io.archivesunleashed.matchbox.ExtractDomain +import org.apache.tika.io.BoundedInputStream +import org.archive.webservices.sparkling.http.HttpMessage +import org.archive.webservices.sparkling.io.IOUtil +import org.archive.webservices.sparkling.util.{ManagedVal, ValueSupplier} +import org.archive.webservices.sparkling.warc.{WarcHeaders, WarcRecord} + +object SparklingArchiveRecord { + val MaxStringByteLength: Int = 1024 +} + +class SparklingArchiveRecord(filename: String, meta: WarcRecord, payload: ManagedVal[ValueSupplier[InputStream]], maxMemoryBytes: Long = -1) extends ArchiveRecord { + import SparklingArchiveRecord._ + + def warc: WarcRecord = new WarcRecord(meta.versionStr, meta.headers, payload.get.get) + + private def payload(r: WarcRecord): Array[Byte] = { + IOUtil.bytes(if (maxMemoryBytes < 0) r.payload else new BoundedInputStream(maxMemoryBytes, r.payload)) + } + private def http(r: WarcRecord): Option[HttpMessage] = { + if (maxMemoryBytes < 0) r.http else r.http.map(_.copy(maxBodyLength = maxMemoryBytes)) + } + + def limitBodyLength(maxBodyLength: Long): SparklingArchiveRecord = { + new SparklingArchiveRecord(filename, meta, payload, maxBodyLength) + } + + override def getArchiveFilename: String = filename + override def getCrawlDate: String = meta.timestamp.filter(_.length >= 8).map(_.take(8)).getOrElse("") + override def getCrawlMonth: String = warc.timestamp.filter(_.length >= 6).map(_.take(6)).getOrElse("") + override def getContentBytes: Array[Byte] = payload(warc) + override def getContentString: String = { + val record = if (maxMemoryBytes < 0) limitBodyLength(MaxStringByteLength).warc else warc + http(record).map { http => + new String(WarcHeaders.http(http.statusLine, http.headers)) + http.bodyString + }.getOrElse(new String(payload(record))) + } + override def getMimeType: String = http(warc).flatMap(_.mime).getOrElse("unknown") + override def getUrl: String = warc.url.getOrElse("") + override def getDomain: String = ExtractDomain(getUrl) + override def getBinaryBytes: Array[Byte] = { + var record = warc + http(record).map(_.body).map(IOUtil.bytes).getOrElse(payload(record)) + } + override def getHttpStatus: String = http(warc).map(_.status.toString).getOrElse("000") + override def getPayloadDigest: String = meta.payloadDigest.orElse(warc.digestPayload()).getOrElse("") +} diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala index 
55805c30..8327f5c5 100644 --- a/src/main/scala/io/archivesunleashed/package.scala +++ b/src/main/scala/io/archivesunleashed/package.scala @@ -16,55 +16,32 @@ package io +import java.io.InputStream import java.security.MessageDigest import java.util.Base64 import io.archivesunleashed.data.ArchiveRecordWritable.ArchiveFormat -import io.archivesunleashed.data.{ - ArchiveRecordInputFormat, - ArchiveRecordWritable -} - -import ArchiveRecordWritable.ArchiveFormat -import io.archivesunleashed.udfs.{ - detectLanguage, - detectMimeTypeTika, - extractDate, - extractDomain, - removeHTML -} - -import io.archivesunleashed.matchbox.{ - DetectLanguage, - DetectMimeTypeTika, - ExtractDate, - ExtractDomain, - ExtractImageDetails, - ExtractImageLinks, - ExtractLinks, - GetExtensionMIME, - RemoveHTML, - RemoveHTTPHeader -} +import io.archivesunleashed.data.{ArchiveRecordInputFormat, ArchiveRecordWritable} +import io.archivesunleashed.udfs.{detectLanguage, detectMimeTypeTika, extractDate, extractDomain, removeHTML} +import io.archivesunleashed.matchbox.{DetectLanguage, DetectMimeTypeTika, ExtractDate, ExtractDomain, ExtractImageDetails, ExtractImageLinks, ExtractLinks, GetExtensionMIME, RemoveHTML, RemoveHTTPHeader} import io.archivesunleashed.matchbox.ExtractDate.DateComponent import io.archivesunleashed.matchbox.ExtractDate.DateComponent.DateComponent import java.net.URI import java.net.URL + import org.apache.commons.codec.binary.Hex import org.apache.commons.io.FilenameUtils import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.LongWritable import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions.{lit, udf} -import org.apache.spark.sql.types.{ - BinaryType, - IntegerType, - StringType, - StructField, - StructType -} +import org.apache.spark.sql.types.{BinaryType, IntegerType, StringType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.{RangePartitioner, SerializableWritable, SparkContext} +import org.archive.webservices.sparkling.io.{HdfsIO, IOUtil} +import org.archive.webservices.sparkling.util.{IteratorUtil, ManagedVal, RddUtil, ValueSupplier} +import org.archive.webservices.sparkling.warc.{WarcLoader, WarcRecord} + import scala.language.postfixOps import scala.reflect.ClassTag import scala.util.matching.Regex @@ -99,22 +76,22 @@ package object archivesunleashed { * @return an RDD of ArchiveRecords for mapping. 
*/ def loadArchives(path: String, sc: SparkContext): RDD[ArchiveRecord] = { - val uri = new URI(path) - val fs = FileSystem.get(uri, sc.hadoopConfiguration) - val p = new Path(path) - sc.newAPIHadoopFile( - getFiles(p, fs), - classOf[ArchiveRecordInputFormat], - classOf[LongWritable], - classOf[ArchiveRecordWritable] - ) - .filter(r => - (r._2.getFormat == ArchiveFormat.ARC) || - ((r._2.getFormat == ArchiveFormat.WARC) && r._2.getRecord.getHeader - .getHeaderValue("WARC-Type") - .equals("response")) - ) - .map(r => new ArchiveRecordImpl(new SerializableWritable(r._2))) + RddUtil.loadFilesLocality(path).flatMap { path => + val filename = path.split('/').last + val in = HdfsIO.open(path, decompress = false) + var prev: Option[ManagedVal[ValueSupplier[InputStream]]] = None + IteratorUtil.cleanup(WarcLoader.load(in).map { record => + for (p <- prev) p.clear(false) + val buffered = IOUtil.buffer(lazyEval = true) { out => + IOUtil.copy(record.payload, out) + } + prev = Some(buffered) + new SparklingArchiveRecord(filename, record, buffered) + }, () => { + for (p <- prev) p.clear(false) + in.close() + }) + } } } From c0d2228beacd648fe18dc79c94186292860e275c Mon Sep 17 00:00:00 2001 From: nruest Date: Tue, 17 May 2022 16:02:27 -0400 Subject: [PATCH 04/11] Updates for pulling in Sparkling. --- pom.xml | 41 ++++- .../io/archivesunleashed/ArchiveRecord.scala | 161 ------------------ .../SparklingArchiveRecord.scala | 47 +++-- .../scala/io/archivesunleashed/package.scala | 55 ++++-- .../scala/io/archivesunleashed/ArcTest.scala | 6 +- .../archivesunleashed/ArchiveRecordTest.scala | 42 ++--- .../io/archivesunleashed/RecordRDDTest.scala | 10 +- .../scala/io/archivesunleashed/WarcTest.scala | 2 +- .../archivesunleashed/app/WgetWarcTest.scala | 6 +- 9 files changed, 142 insertions(+), 228 deletions(-) diff --git a/pom.xml b/pom.xml index 1bd0ede6..50bb3c3a 100644 --- a/pom.xml +++ b/pom.xml @@ -24,6 +24,7 @@ 2.12 2.7.4 3.0.1 + 29.0-jre github 3.0 2.5.2 @@ -107,6 +108,13 @@ + + + com.google.common. + com.google.common.shaded. 
+ + + @@ -415,6 +423,21 @@ scala-library ${scala.version} + + commons-codec + commons-codec + 1.12 + + + org.apache.commons + commons-compress + 1.14 + + + com.google.guava + guava + ${guava.version} + org.apache.hadoop hadoop-mapreduce-client-core @@ -460,12 +483,24 @@ org.netpreserve.commons webarchive-commons - 1.1.9 + 1.1.8 org.apache.hadoop hadoop-core + + org.apache.httpcomponents + httpcore + + + org.apache.httpcomponents + httpclient + + + joda-time + joda-time + @@ -539,8 +574,8 @@ com.github.internetarchive - Sparkling - main-SNAPSHOT + Sparkling + main-SNAPSHOT diff --git a/src/main/scala/io/archivesunleashed/ArchiveRecord.scala b/src/main/scala/io/archivesunleashed/ArchiveRecord.scala index 61e07f9b..48c3147e 100644 --- a/src/main/scala/io/archivesunleashed/ArchiveRecord.scala +++ b/src/main/scala/io/archivesunleashed/ArchiveRecord.scala @@ -16,28 +16,6 @@ package io.archivesunleashed -import java.io.ByteArrayInputStream -import java.security.MessageDigest -import java.text.SimpleDateFormat - -import io.archivesunleashed.data.{ - ArcRecordUtils, - WarcRecordUtils, - ArchiveRecordWritable -} -import io.archivesunleashed.matchbox.{ - ComputeMD5, - ExtractDate, - ExtractDomain, - RemoveHTTPHeader -} -import org.apache.commons.httpclient.{Header, HttpParser, StatusLine} -import org.apache.spark.SerializableWritable -import org.archive.io.arc.ARCRecord -import org.archive.io.warc.WARCRecord -import org.archive.util.ArchiveUtils -import scala.util.Try - /** Trait for a record in a web archive. */ trait ArchiveRecord extends Serializable { @@ -74,142 +52,3 @@ trait ArchiveRecord extends Serializable { /** Returns payload digest (SHA1). */ def getPayloadDigest: String } - -/** Default implementation of a record in a web archive. - * - * @constructor an archive record. 
- * @param r the serialized record - */ -class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) - extends ArchiveRecord { - val recordFormat = r.t.getFormat - val ISO8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX") - - val getArchiveFilename: String = { - if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { - r.t.getRecord.asInstanceOf[ARCRecord].getMetaData.getReaderIdentifier() - } else { - r.t.getRecord.asInstanceOf[WARCRecord].getHeader.getReaderIdentifier() - } - } - - val getCrawlDate: String = { - if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { - ExtractDate( - r.t.getRecord.asInstanceOf[ARCRecord].getMetaData.getDate, - ExtractDate.DateComponent.YYYYMMDDHHMMSS - ) - } else { - ExtractDate( - ArchiveUtils.get14DigitDate( - ISO8601.parse( - r.t.getRecord.asInstanceOf[WARCRecord].getHeader.getDate - ) - ), - ExtractDate.DateComponent.YYYYMMDDHHMMSS - ) - } - } - - val getCrawlMonth: String = { - if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { - ExtractDate( - r.t.getRecord.asInstanceOf[ARCRecord].getMetaData.getDate, - ExtractDate.DateComponent.YYYYMM - ) - } else { - ExtractDate( - ArchiveUtils.get14DigitDate( - ISO8601.parse( - r.t.getRecord.asInstanceOf[WARCRecord].getHeader.getDate - ) - ), - ExtractDate.DateComponent.YYYYMM - ) - } - } - - val getContentBytes: Array[Byte] = { - if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { - ArcRecordUtils.getContent(r.t.getRecord.asInstanceOf[ARCRecord]) - } else { - WarcRecordUtils.getContent(r.t.getRecord.asInstanceOf[WARCRecord]) - } - } - - val getContentString: String = { - new String(getContentBytes) - } - - val getMimeType: String = { - if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { - Option(r.t.getRecord.asInstanceOf[ARCRecord].getMetaData.getMimetype) - .getOrElse("unknown") - } else { - Option(WarcRecordUtils.getWarcResponseMimeType(getContentBytes)) - .getOrElse("unknown") - } - } - - val getUrl: String = { - if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { - r.t.getRecord.asInstanceOf[ARCRecord].getMetaData.getUrl - } else { - r.t.getRecord - .asInstanceOf[WARCRecord] - .getHeader - .getUrl - .replaceAll("<|>", "") - } - } - - val getHttpStatus: String = { - if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { - Option(r.t.getRecord.asInstanceOf[ARCRecord].getMetaData.getStatusCode) - .getOrElse("000") - } else { - Try( - new StatusLine( - new String( - HttpParser.readRawLine(new ByteArrayInputStream(getContentBytes)) - ) - ).getStatusCode - ).toOption match { - case Some(x) => x.toString - case None => "000" - } - } - } - - val getDomain: String = { - ExtractDomain(getUrl) - } - - val getBinaryBytes: Array[Byte] = { - if (getContentString.startsWith("HTTP/")) { - getContentBytes.slice( - getContentString.indexOf(RemoveHTTPHeader.headerEnd) - + RemoveHTTPHeader.headerEnd.length, - getContentBytes.length - ) - } else { - getContentBytes - } - } - - val getPayloadDigest: String = { - if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { - "sha1:" + MessageDigest - .getInstance("SHA1") - .digest(getContentBytes) - .map("%02x".format(_)) - .mkString - } else { - r.t.getRecord - .asInstanceOf[WARCRecord] - .getHeader - .getHeaderValue("WARC-Payload-Digest") - .asInstanceOf[String] - } - } -} diff --git a/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala b/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala index 8815aaec..9a14aed0 100644 --- 
a/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala +++ b/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala @@ -13,16 +13,26 @@ object SparklingArchiveRecord { val MaxStringByteLength: Int = 1024 } -class SparklingArchiveRecord(filename: String, meta: WarcRecord, payload: ManagedVal[ValueSupplier[InputStream]], maxMemoryBytes: Long = -1) extends ArchiveRecord { +class SparklingArchiveRecord( + filename: String, + meta: WarcRecord, + payload: ManagedVal[ValueSupplier[InputStream]], + maxMemoryBytes: Long = -1 +) extends ArchiveRecord { import SparklingArchiveRecord._ - def warc: WarcRecord = new WarcRecord(meta.versionStr, meta.headers, payload.get.get) + def warc: WarcRecord = + new WarcRecord(meta.versionStr, meta.headers, payload.get.get) private def payload(r: WarcRecord): Array[Byte] = { - IOUtil.bytes(if (maxMemoryBytes < 0) r.payload else new BoundedInputStream(maxMemoryBytes, r.payload)) + IOUtil.bytes( + if (maxMemoryBytes < 0) r.payload + else new BoundedInputStream(maxMemoryBytes, r.payload) + ) } private def http(r: WarcRecord): Option[HttpMessage] = { - if (maxMemoryBytes < 0) r.http else r.http.map(_.copy(maxBodyLength = maxMemoryBytes)) + if (maxMemoryBytes < 0) r.http + else r.http.map(_.copy(maxBodyLength = maxMemoryBytes)) } def limitBodyLength(maxBodyLength: Long): SparklingArchiveRecord = { @@ -30,22 +40,31 @@ class SparklingArchiveRecord(filename: String, meta: WarcRecord, payload: Manage } override def getArchiveFilename: String = filename - override def getCrawlDate: String = meta.timestamp.filter(_.length >= 8).map(_.take(8)).getOrElse("") - override def getCrawlMonth: String = warc.timestamp.filter(_.length >= 6).map(_.take(6)).getOrElse("") + override def getCrawlDate: String = + meta.timestamp.filter(_.length >= 14).map(_.take(14)).getOrElse("") + override def getCrawlMonth: String = + warc.timestamp.filter(_.length >= 6).map(_.take(6)).getOrElse("") override def getContentBytes: Array[Byte] = payload(warc) override def getContentString: String = { - val record = if (maxMemoryBytes < 0) limitBodyLength(MaxStringByteLength).warc else warc - http(record).map { http => - new String(WarcHeaders.http(http.statusLine, http.headers)) + http.bodyString - }.getOrElse(new String(payload(record))) + val record = + if (maxMemoryBytes < 0) limitBodyLength(MaxStringByteLength).warc + else warc + http(record) + .map { http => + new String(WarcHeaders.http(http.statusLine, http.headers)) + http.bodyString + } + .getOrElse(new String(payload(record))) } - override def getMimeType: String = http(warc).flatMap(_.mime).getOrElse("unknown") - override def getUrl: String = warc.url.getOrElse("") + override def getMimeType: String = + http(warc).flatMap(_.mime).getOrElse("unknown") + override def getUrl: String = warc.url.getOrElse("").replaceAll("<|>", "") override def getDomain: String = ExtractDomain(getUrl) override def getBinaryBytes: Array[Byte] = { var record = warc http(record).map(_.body).map(IOUtil.bytes).getOrElse(payload(record)) } - override def getHttpStatus: String = http(warc).map(_.status.toString).getOrElse("000") - override def getPayloadDigest: String = meta.payloadDigest.orElse(warc.digestPayload()).getOrElse("") + override def getHttpStatus: String = + http(warc).map(_.status.toString).getOrElse("000") + override def getPayloadDigest: String = + meta.payloadDigest.orElse(warc.digestPayload()).getOrElse("") } diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala index 
8327f5c5..cbab7eeb 100644 --- a/src/main/scala/io/archivesunleashed/package.scala +++ b/src/main/scala/io/archivesunleashed/package.scala @@ -20,10 +20,18 @@ import java.io.InputStream import java.security.MessageDigest import java.util.Base64 -import io.archivesunleashed.data.ArchiveRecordWritable.ArchiveFormat -import io.archivesunleashed.data.{ArchiveRecordInputFormat, ArchiveRecordWritable} -import io.archivesunleashed.udfs.{detectLanguage, detectMimeTypeTika, extractDate, extractDomain, removeHTML} -import io.archivesunleashed.matchbox.{DetectLanguage, DetectMimeTypeTika, ExtractDate, ExtractDomain, ExtractImageDetails, ExtractImageLinks, ExtractLinks, GetExtensionMIME, RemoveHTML, RemoveHTTPHeader} +import io.archivesunleashed.matchbox.{ + DetectLanguage, + DetectMimeTypeTika, + ExtractDate, + ExtractDomain, + ExtractImageDetails, + ExtractImageLinks, + ExtractLinks, + GetExtensionMIME, + RemoveHTML, + RemoveHTTPHeader +} import io.archivesunleashed.matchbox.ExtractDate.DateComponent import io.archivesunleashed.matchbox.ExtractDate.DateComponent.DateComponent import java.net.URI @@ -32,14 +40,24 @@ import java.net.URL import org.apache.commons.codec.binary.Hex import org.apache.commons.io.FilenameUtils import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.hadoop.io.LongWritable import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions.{lit, udf} -import org.apache.spark.sql.types.{BinaryType, IntegerType, StringType, StructField, StructType} +import org.apache.spark.sql.types.{ + BinaryType, + IntegerType, + StringType, + StructField, + StructType +} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.{RangePartitioner, SerializableWritable, SparkContext} import org.archive.webservices.sparkling.io.{HdfsIO, IOUtil} -import org.archive.webservices.sparkling.util.{IteratorUtil, ManagedVal, RddUtil, ValueSupplier} +import org.archive.webservices.sparkling.util.{ + IteratorUtil, + ManagedVal, + RddUtil, + ValueSupplier +} import org.archive.webservices.sparkling.warc.{WarcLoader, WarcRecord} import scala.language.postfixOps @@ -80,17 +98,20 @@ package object archivesunleashed { val filename = path.split('/').last val in = HdfsIO.open(path, decompress = false) var prev: Option[ManagedVal[ValueSupplier[InputStream]]] = None - IteratorUtil.cleanup(WarcLoader.load(in).map { record => - for (p <- prev) p.clear(false) - val buffered = IOUtil.buffer(lazyEval = true) { out => - IOUtil.copy(record.payload, out) + IteratorUtil.cleanup( + WarcLoader.load(in).map { record => + for (p <- prev) p.clear(false) + val buffered = IOUtil.buffer(lazyEval = true) { out => + IOUtil.copy(record.payload, out) + } + prev = Some(buffered) + new SparklingArchiveRecord(filename, record, buffered) + }, + () => { + for (p <- prev) p.clear(false) + in.close() } - prev = Some(buffered) - new SparklingArchiveRecord(filename, record, buffered) - }, () => { - for (p <- prev) p.clear(false) - in.close() - }) + ) } } } diff --git a/src/test/scala/io/archivesunleashed/ArcTest.scala b/src/test/scala/io/archivesunleashed/ArcTest.scala index 4da630b2..2efcade9 100644 --- a/src/test/scala/io/archivesunleashed/ArcTest.scala +++ b/src/test/scala/io/archivesunleashed/ArcTest.scala @@ -48,7 +48,7 @@ class ArcTest extends FunSuite with BeforeAndAfter { val dayMonthTestA = "200805" test("Count records") { - assert(RecordLoader.loadArchives(arcPath, sc).count == 300L) + assert(RecordLoader.loadArchives(arcPath, sc).count == 299L) } test("Filter date RDD") { @@ -80,14 
+80,14 @@ class ArcTest extends FunSuite with BeforeAndAfter { .loadArchives(arcPath, sc) .discardUrlPatterns(Set("http://www.archive.org/about/.*".r)) assert(keepMatches.count == 16L) - assert(discardMatches.count == 284L) + assert(discardMatches.count == 283L) } test("Count links RDD") { val links = RecordLoader .loadArchives(arcPath, sc) .map(r => ExtractLinks(r.getUrl, r.getContentString)) - assert(links.count == 300L) + assert(links.count == 299L) } test("Detect language RDD") { diff --git a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala index f9c61626..1c921ad1 100644 --- a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala +++ b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala @@ -47,8 +47,8 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { } test("Count records") { - assert(RecordLoader.loadArchives(arcPath, sc).count == 300L) - assert(RecordLoader.loadArchives(warcPath, sc).count == 299L) + assert(RecordLoader.loadArchives(arcPath, sc).count == 299L) + assert(RecordLoader.loadArchives(warcPath, sc).count == 822L) } test("Resource name produces expected result") { @@ -76,10 +76,10 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { .map(x => x.getCrawlDate) .take(3) assert( - textSampleArc.deep == Array(exampleDate, exampleDate, exampleDate).deep + textSampleArc.deep == Array(exampleDate, exampleDate, "20080430204826").deep ) assert( - textSampleWarc.deep == Array(exampleDate, exampleDate, "20080430204826").deep + textSampleWarc.deep == Array(exampleDate, exampleDate, exampleDate).deep ) } @@ -92,8 +92,8 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { .loadArchives(warcPath, sc) .map(x => x.getDomain) .take(3) - assert(textSampleArc.deep == Array("", "", exampleUrl).deep) - assert(textSampleWarc.deep == Array("", exampleUrl, exampleUrl).deep) + assert(textSampleArc.deep == Array("", exampleUrl, exampleUrl).deep) + assert(textSampleWarc.deep == Array("", "", exampleUrl).deep) } test("URLs") { @@ -107,16 +107,16 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { .take(3) assert( textSampleArc.deep == Array( - "filedesc://IAH-20080430204825-00000-blackbook.arc", "dns:www.archive.org", - "http://www.archive.org/robots.txt" + "http://www.archive.org/robots.txt", + "http://www.archive.org/" ).deep ) assert( textSampleWarc.deep == Array( + "", "dns:www.archive.org", - "http://www.archive.org/robots.txt", - "http://www.archive.org/" + "http://www.archive.org/robots.txt" ).deep ) } @@ -132,13 +132,13 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { .take(3) assert( textSampleArc.deep == Array( + "unknown", exampleMimeType, - "text/dns", - exampleMimeType + "text/html" ).deep ) assert( - textSampleWarc.deep == Array("unknown", exampleMimeType, "text/html").deep + textSampleWarc.deep == Array("unknown", "unknown", exampleMimeType).deep ) } @@ -154,14 +154,14 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { assert( textSampleArc.deep == Array( exampleStatusCode1, - exampleStatusCode1, + exampleStatusCode2, exampleStatusCode2 ).deep ) assert( textSampleWarc.deep == Array( exampleStatusCode1, - exampleStatusCode2, + exampleStatusCode1, exampleStatusCode2 ).deep ) @@ -178,16 +178,16 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { .take(3) assert( textSampleArc.deep == Array( - "sha1:252efd6dd414d91812dd9b0f897cdb2b44f64601", - "sha1:8d115d0e83c5dcd66b13619e04d60a36cb2c1ee4", - 
"sha1:ede22581685942721c7b9743dced317633d00e33" + "sha1:RUIV2DUDYXONM2YTMGPAJVQKG3FSYHXE", + "sha1:SUCGMUVXDKVB5CS2NL4R4JABNX7K466U", + "sha1:2WAXX5NUWNNCS2BDKCO5OVDQBJVNKIVV" ).deep ) assert( textSampleWarc.deep == Array( - null, - "sha1:SUCGMUVXDKVB5CS2NL4R4JABNX7K466U", - "sha1:2WAXX5NUWNNCS2BDKCO5OVDQBJVNKIVV" + "sha1:B3CPX3Q4JK373UZA6HDKGYZVSNQDTGFQ", + "sha1:RUIV2DUDYXONM2YTMGPAJVQKG3FSYHXE", + "sha1:sucgmuvxdkvb5cs2nl4r4jabnx7k466u" ).deep ) } diff --git a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala index cef0a209..c3faa9cc 100644 --- a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala @@ -167,9 +167,9 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { .take(3) assert( r2.deep == Array( - "filedesc://IAH-20080430204825-00000-blackbook.arc", "http://www.archive.org/robots.txt", - "http://www.archive.org/images/logoc.jpg" + "http://www.archive.org/images/logoc.jpg", + "http://ia331306.us.archive.org/robots.txt" ).deep ) } @@ -211,9 +211,9 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { .take(3) assert( r2.deep == Array( - "filedesc://IAH-20080430204825-00000-blackbook.arc", "http://www.archive.org/", - "http://www.archive.org/index.php" + "http://www.archive.org/index.php", + "http://www.archive.org/images/go-button-gateway.gif" ).deep ) } @@ -247,7 +247,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { } test("Discard HTTP status codes RDD") { - val expected = 46 + val expected = 45 val base = RecordLoader.loadArchives(arcPath, sc) val statusCodes: Set[String] = Set("200", "404") val r2 = base.discardHttpStatus(statusCodes).count diff --git a/src/test/scala/io/archivesunleashed/WarcTest.scala b/src/test/scala/io/archivesunleashed/WarcTest.scala index 44b9879f..03691249 100644 --- a/src/test/scala/io/archivesunleashed/WarcTest.scala +++ b/src/test/scala/io/archivesunleashed/WarcTest.scala @@ -42,7 +42,7 @@ class WarcTest extends FunSuite with BeforeAndAfter { } test("Count records") { - assert(299L == records.count) + assert(822L == records.count) } test("WARC extract domain RDD") { diff --git a/src/test/scala/io/archivesunleashed/app/WgetWarcTest.scala b/src/test/scala/io/archivesunleashed/app/WgetWarcTest.scala index 9c90e440..56c2b237 100644 --- a/src/test/scala/io/archivesunleashed/app/WgetWarcTest.scala +++ b/src/test/scala/io/archivesunleashed/app/WgetWarcTest.scala @@ -54,9 +54,9 @@ class WgetWarcTest extends FunSuite with BeforeAndAfter { assert(dfResults.length == RESULTSLENGTH) assert(dfResults(0).get(0) == "20210511181400") - assert(dfResults(0).get(1) == "http://www.archiveteam.org/") - assert(dfResults(1).get(0) == "20210511181401") - assert(dfResults(1).get(1) == "https://wiki.archiveteam.org/") + assert(dfResults(0).get(1) == "") + assert(dfResults(1).get(0) == "20210511181400") + assert(dfResults(1).get(1) == "http://www.archiveteam.org/") } after { From 80584841b826e70005e50637ba3abc161c712891 Mon Sep 17 00:00:00 2001 From: nruest Date: Wed, 18 May 2022 14:18:57 -0400 Subject: [PATCH 05/11] Sparkling integration updates. 
* fix discardDate issue * update tests for #494 * add test for #493 * add test for #532 * move issue specific tests to their own directory * add copyright statement to SparklingArchiveRecord * move webarchive-commons back to 1.1.9 * resolves #532 * resolves #494 * resolves #493 * resolves #492 * resolves #317 * resolves #260 * resolves #182 * resolves #76 * resolves #74 * resolves #73 * resolves #23 * resolves #18 --- pom.xml | 4 +- .../SparklingArchiveRecord.scala | 16 +++++ .../scala/io/archivesunleashed/package.scala | 7 +- src/test/resources/warc/issue-493.warc | Bin 0 -> 57275 bytes .../io/archivesunleashed/RecordRDDTest.scala | 34 +++------ .../issues/Issue493Test.scala | 66 ++++++++++++++++++ .../{app => issues}/WgetWarcTest.scala | 0 7 files changed, 99 insertions(+), 28 deletions(-) create mode 100644 src/test/resources/warc/issue-493.warc create mode 100644 src/test/scala/io/archivesunleashed/issues/Issue493Test.scala rename src/test/scala/io/archivesunleashed/{app => issues}/WgetWarcTest.scala (100%) diff --git a/pom.xml b/pom.xml index 50bb3c3a..b3dbb1b6 100644 --- a/pom.xml +++ b/pom.xml @@ -107,14 +107,12 @@ META-INF/services/org.apache.lucene.codecs.Codec - com.google.common. com.google.common.shaded. - @@ -483,7 +481,7 @@ org.netpreserve.commons webarchive-commons - 1.1.8 + 1.1.9 org.apache.hadoop diff --git a/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala b/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala index 9a14aed0..a795feda 100644 --- a/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala +++ b/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala @@ -1,3 +1,19 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package io.archivesunleashed import java.io.InputStream diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala index cbab7eeb..754a6542 100644 --- a/src/main/scala/io/archivesunleashed/package.scala +++ b/src/main/scala/io/archivesunleashed/package.scala @@ -814,8 +814,11 @@ package object archivesunleashed { * * @param date a list of dates */ - def discardDate(date: String): RDD[ArchiveRecord] = { - rdd.filter(r => r.getCrawlDate != date) + def discardDate( + dates: List[String], + component: DateComponent = DateComponent.YYYYMMDD + ): RDD[ArchiveRecord] = { + rdd.filter(r => !dates.contains(ExtractDate(r.getCrawlDate, component))) } /** Filters detected URLs. 
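For context on the `discardDate` rework above: the old signature took a single
date string and compared the crawl date for exact equality, while the new one
takes a list of dates plus a `DateComponent` granularity and extracts that
component before matching. A minimal usage sketch (a hypothetical spark-shell
session; `sc` and the archive path are illustrative assumptions, not part of
this patch):

    import io.archivesunleashed._
    import io.archivesunleashed.matchbox.ExtractDate.DateComponent

    // Hypothetical input path; any ARC/WARC readable by RecordLoader works.
    val records = RecordLoader.loadArchives("/path/to/example.arc.gz", sc)

    // Old behaviour: records.discardDate("20080430") kept a record only if
    // its crawl date string differed exactly from the argument.
    // New behaviour: extract the requested component first, then drop any
    // record whose component value appears in the list.
    val withoutCrawlsFrom2008 =
      records.discardDate(List("2008"), DateComponent.YYYY)

The updated "Discard date RDD" test below exercises exactly this
component-based matching.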
diff --git a/src/test/resources/warc/issue-493.warc b/src/test/resources/warc/issue-493.warc
new file mode 100644
index 0000000000000000000000000000000000000000..0073d07b3c0b351cedd86e6a1e34aa0ec53902b8
Binary files /dev/null and b/src/test/resources/warc/issue-493.warc differ
z9{x2sq((tR$jd4dQg{wWsgKYAg63!_72CSZsI_IQ=R97Ff?}3}9Lp8GyKMIQnwIzG zpDU(qnZK4PU42;qyx?hlM1RFB>KcHvrpW`XrSlp8brrO*X+?M6mGUNm5er?1#OAZA zmsl1{#(UmuRv?Lus^IZ=H?X-I06Oz(Q?8Mwcd~fp2i%s?T;F@{a#~2*HY>#*F-?UZ zEwevC4wp;bd-+h?-Z%Q)4xMt~erUzW@!s*t{&8m549n43cN~w+-TrC+CH5pb7@pwh zEH@!T2VLM*qkz}!REUn(A+)a8bDN_w)&$ z62lk2J+ta}+n(P8Y(1x$;L8xZspae$_5y@#0Z-TSdrb%{ui7-C=5$1I_RGa_Qu8)= z&x*#XryQYuvi{54^&3Nit)7`O7InvbBO&YMqY7xWt#*r0>`bS-9c<7rmLc%GspF=^ zIcZpdd*InK^>x~?H+RqC_{l641z08{md*`2CyYqf22dVRjyI5Hc{bFtWeGMmDl6c+DM1_82&Kdp0*rocPDbx82N9~8*ladDMgFWq+giDi2QWTaSmPQxehVhc>flYC41=a20X}>R=C_QCcnH-a~ z8z3>OFMyg|yRPtXJ=S#vZ#j}*+jzqU;5g9SpHcZ(N-wnAkoU&b^o016!xvKxP$uQH$|IMpT%+4S6aP=^p!r-u}4QrJ;c5_gBYWOLG$q zZMjUNy{@X-L4RGEK0>D%_)38C?PmH`Z28Ot42`*LgwM6gHGDBU_Mxx$;X(V(h3s@L zwTmp+%I7k~XZ;~4c0wgJZY~QGaw=miwEe6$Dwy(t^ACs3M++`?%*TlCVN)Zj2v9bepXcP8+hoGOS9<(5#hzkS@lA?d z_fH=0#v?y}{hC7>mwzz}*-*kg@w^E49$5a#9#^~T_3mBwt7$6*jH_z6*Y9A0#4?*N zsU4-_`b=!>7BxMN)mdZE0q1m@-tg$ncb&4D*2nJ6o!K_JdsLGswKRy8gwR|dUDDXk z$E*T5JtY|+qjZziIQKF{KT+O=8|Nq$2eJNM*{9&J?Z?YkO z_?hA{y{>mF%#-gDfDC63H+Z#Q&Jgtxaek_WaZLjqDKMKG@SH#yU+9%kNhil>Xj^tb!lMlP@(lLo1ZAF%ri2jHb4UBv z{co@>KlgazO*qCVD*}@bA|azU4pzx#=ni2eX*CKvIx~8ipD`k=2c_BSU;v%>3^NXT z(*a|Q0*>o23JR3QSYHchRAQc*C;rK7t@*37 zm4P(slcTam;Y_nQ z(tf5mYJ^;Odv;y4?|6kA$3gVk`1=2RIEtdW7@rsPM?vP0~fQB$3#4-G~QFe9$5(-N#NY z`$5kg$!;g{y?6x5DH_8MDSN4k4|Pfd#$y5;HJtboR49cFO62Z3v-w@aS)sE&Sj&;j zu>znENRE;Vy&V#`nvp=d|cejig{{rJkGrJ=DIA@+aeu$ zf*D?Ft>5?x(J+*F5B_wV3m0IxT_gx|!md!WQ{H;fT66cZc~!aR&J)i@_)3-!i1fC3|)JRO&QAHB!sVQiC7i(m*~M!Q1jYN`aL_^3+) zzonp$5muU?B!Bdj?M;Lf5!6gX7Sr@itPICkUR3vM>7NN9MCU?+AtE93+Ao9EC- zEM}oKH#DG}&?YC;*)5$X$y}6FoVR-frNgi+*}Ap&^>)fc7M3^4C8x^6KM<^Oa@!FU zm+D#AoQf$z3(D@*U@Hw<>0I=FwL?GWXfaeC{8Ji_KE;`|w9t$9RBb4Z35TWh`?wKr zD9jmQtYhOoB+yu_E<;nTO5j{&guMQm$*`oy?Z~RT6CV`4Je)e7yFsYLoe$m>)jL%f z)fc-5nkR=mC-QWFubH0D>+!xaNjbN5Fca8?usS41~Mw3bxFDrQ4vYIFgxs|GG zW>w{w9&;#ZWI6e?O=ui2xA_MVPk7>RacLPw4~3l7caLC4i`MWY%@1TJ>N$H@P>lt5 zz7U{6iAERTkP#DW#vDuhd}#MV@npp~B8S9Qkxt=)agA{Ml(Cu&!Cd2rCzrBRP3rM< z>29ROTA=?NxWmac!%iF-6I1WCNW61PooL@zjfra8rIOb=zk!fg>LlvMx!A@TPd9)1 z6eRxp#4zuhba{$_r9t8uf}I|YhwKUIPKGxm&+4`>1cLHPk@ z#vfrCK9Cu}golq8AOJNp;V}jAK_QTTgaWwz)r8c4XGrm{hrqrQQUq&j%4q7Siot=} z%3!d(l&XS;u7HdVKOb03MpK3VZ_zA2!T`R@o_!j*{}Tzt|HbUtmyGPk4BO}E`hUWn z8JnY0gC;yE?99vzhypbEc>z#R#sP?UsFn1Qb$2ql3x-guc#<5ucjd@0oD-Ekoz_U z_(xIVA4M?vz$obLZ)YLD@L=j6xj;ujRM^oNmNHGk^dXMZ1}qLj=rt z{+So@>$&(NjOJq;_s28vpC{r`;0OZrmtdjqG#ddm6 !(x.getCrawlDate.contains(date))).collect() - val r2 = base.discardDate(date).take(3) - assert(r.deep == Array().deep) + val date = "2007" + val dateComponent = DateComponent.YYYY + val r = base + .filter(x => ExtractDate(x.getCrawlDate, dateComponent) != date) + .map(mp => mp.getUrl) + .take(3) + val r2 = base + .discardDate(List(date), dateComponent) + .map(mp => mp.getUrl) + .take(3) + assert(r2.sameElements(r)) } test("Discard URLs RDD") { diff --git a/src/test/scala/io/archivesunleashed/issues/Issue493Test.scala b/src/test/scala/io/archivesunleashed/issues/Issue493Test.scala new file mode 100644 index 00000000..10be0b59 --- /dev/null +++ b/src/test/scala/io/archivesunleashed/issues/Issue493Test.scala @@ -0,0 +1,66 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.archivesunleashed.app + +import com.google.common.io.Resources +import io.archivesunleashed.RecordLoader +import io.archivesunleashed.udfs.{removeHTML, removeHTTPHeader} +import org.apache.spark.sql.SparkSession +import org.apache.spark.{SparkConf, SparkContext} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{BeforeAndAfter, FunSuite} + +@RunWith(classOf[JUnitRunner]) +class Issue493Test extends FunSuite with BeforeAndAfter { + private val arcPath = Resources.getResource("warc/issue-493.warc").getPath + private val master = "local[4]" + private val appName = "example-spark" + private var sc: SparkContext = _ + + before { + val conf = new SparkConf() + .setMaster(master) + .setAppName(appName) + sc = new SparkContext(conf) + } + + test("Test for issue 493 - compressed payload warcs") { + val df = RecordLoader.loadArchives(arcPath, sc).webpages() + + // We need this in order to use the $-notation + val spark = SparkSession.builder().master("local").getOrCreate() + // scalastyle:off + import spark.implicits._ + // scalastyle:on + + val dfResults = df + .select(removeHTML(removeHTTPHeader($"content"))) + .head(2) + val RESULTSLENGTH = 2 + + assert(dfResults.length == RESULTSLENGTH) + assert(dfResults(0).get(0) == "makkaronisch fuer niedlich") + assert(dfResults(1).get(0) == "makkaronisch fuer niedlich die melodie") + } + + after { + if (sc != null) { + sc.stop() + } + } +} diff --git a/src/test/scala/io/archivesunleashed/app/WgetWarcTest.scala b/src/test/scala/io/archivesunleashed/issues/WgetWarcTest.scala similarity index 100% rename from src/test/scala/io/archivesunleashed/app/WgetWarcTest.scala rename to src/test/scala/io/archivesunleashed/issues/WgetWarcTest.scala From b1af6eb6f0061beb700116743aa3f4e5678516ee Mon Sep 17 00:00:00 2001 From: nruest Date: Wed, 18 May 2022 14:30:19 -0400 Subject: [PATCH 06/11] Remove java formatter, and apply scalafmt. 
--- .github/workflows/java-formatter.yml | 17 ----------------- .../scala/io/archivesunleashed/package.scala | 4 ++-- 2 files changed, 2 insertions(+), 19 deletions(-) delete mode 100644 .github/workflows/java-formatter.yml diff --git a/.github/workflows/java-formatter.yml b/.github/workflows/java-formatter.yml deleted file mode 100644 index 098c7d7e..00000000 --- a/.github/workflows/java-formatter.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: Java Formatter (Google Java Format) - -on: - pull_request: - branches: [main] - -jobs: - formatting: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-java@v1 - with: - java-version: "11" - - uses: axel-op/googlejavaformat-action@v3 - with: - args: "--dry-run" diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala index 754a6542..84f401fa 100644 --- a/src/main/scala/io/archivesunleashed/package.scala +++ b/src/main/scala/io/archivesunleashed/package.scala @@ -815,8 +815,8 @@ package object archivesunleashed { * @param date a list of dates */ def discardDate( - dates: List[String], - component: DateComponent = DateComponent.YYYYMMDD + dates: List[String], + component: DateComponent = DateComponent.YYYYMMDD ): RDD[ArchiveRecord] = { rdd.filter(r => !dates.contains(ExtractDate(r.getCrawlDate, component))) } From 6f2422066450e6fe885b9725df3142ad50677374 Mon Sep 17 00:00:00 2001 From: nruest Date: Thu, 19 May 2022 11:07:40 -0400 Subject: [PATCH 07/11] Filter on response and revisit records, and Try getBinaryBytes, getContentBytes, getContentString and getPayloadDigest. --- .../SparklingArchiveRecord.scala | 40 +++++++++++-------- .../scala/io/archivesunleashed/package.scala | 15 +++---- .../archivesunleashed/ArchiveRecordTest.scala | 18 ++++----- .../scala/io/archivesunleashed/WarcTest.scala | 2 +- .../issues/WgetWarcTest.scala | 6 +-- 5 files changed, 45 insertions(+), 36 deletions(-) diff --git a/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala b/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala index a795feda..47f845bb 100644 --- a/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala +++ b/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala @@ -24,6 +24,7 @@ import org.archive.webservices.sparkling.http.HttpMessage import org.archive.webservices.sparkling.io.IOUtil import org.archive.webservices.sparkling.util.{ManagedVal, ValueSupplier} import org.archive.webservices.sparkling.warc.{WarcHeaders, WarcRecord} +import scala.util.Try object SparklingArchiveRecord { val MaxStringByteLength: Int = 1024 @@ -60,27 +61,34 @@ class SparklingArchiveRecord( meta.timestamp.filter(_.length >= 14).map(_.take(14)).getOrElse("") override def getCrawlMonth: String = warc.timestamp.filter(_.length >= 6).map(_.take(6)).getOrElse("") - override def getContentBytes: Array[Byte] = payload(warc) - override def getContentString: String = { - val record = - if (maxMemoryBytes < 0) limitBodyLength(MaxStringByteLength).warc - else warc - http(record) - .map { http => - new String(WarcHeaders.http(http.statusLine, http.headers)) + http.bodyString - } - .getOrElse(new String(payload(record))) - } + override def getContentBytes: Array[Byte] = + Try { + payload(warc) + }.getOrElse(Array.empty) + override def getContentString: String = + Try { + val record = + if (maxMemoryBytes < 0) limitBodyLength(MaxStringByteLength).warc + else warc + http(record) + .map { http => + new String(WarcHeaders.http(http.statusLine, http.headers)) + 
http.bodyString + } + .getOrElse(new String(payload(record))) + }.getOrElse("") override def getMimeType: String = http(warc).flatMap(_.mime).getOrElse("unknown") override def getUrl: String = warc.url.getOrElse("").replaceAll("<|>", "") override def getDomain: String = ExtractDomain(getUrl) - override def getBinaryBytes: Array[Byte] = { - var record = warc - http(record).map(_.body).map(IOUtil.bytes).getOrElse(payload(record)) - } + override def getBinaryBytes: Array[Byte] = + Try { + var record = warc + http(record).map(_.body).map(IOUtil.bytes).getOrElse(payload(record)) + }.getOrElse(Array.empty) override def getHttpStatus: String = http(warc).map(_.status.toString).getOrElse("000") override def getPayloadDigest: String = - meta.payloadDigest.orElse(warc.digestPayload()).getOrElse("") + Try { + meta.payloadDigest.orElse(warc.digestPayload()).getOrElse("") + }.getOrElse("") } diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala index 84f401fa..bac71532 100644 --- a/src/main/scala/io/archivesunleashed/package.scala +++ b/src/main/scala/io/archivesunleashed/package.scala @@ -99,13 +99,14 @@ package object archivesunleashed { val in = HdfsIO.open(path, decompress = false) var prev: Option[ManagedVal[ValueSupplier[InputStream]]] = None IteratorUtil.cleanup( - WarcLoader.load(in).map { record => - for (p <- prev) p.clear(false) - val buffered = IOUtil.buffer(lazyEval = true) { out => - IOUtil.copy(record.payload, out) - } - prev = Some(buffered) - new SparklingArchiveRecord(filename, record, buffered) + WarcLoader.load(in).filter(r => r.isResponse || r.isRevisit).map { + record => + for (p <- prev) p.clear(false) + val buffered = IOUtil.buffer(lazyEval = true) { out => + IOUtil.copy(record.payload, out) + } + prev = Some(buffered) + new SparklingArchiveRecord(filename, record, buffered) }, () => { for (p <- prev) p.clear(false) diff --git a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala index 1c921ad1..ebbe312b 100644 --- a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala +++ b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala @@ -48,7 +48,7 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { test("Count records") { assert(RecordLoader.loadArchives(arcPath, sc).count == 299L) - assert(RecordLoader.loadArchives(warcPath, sc).count == 822L) + assert(RecordLoader.loadArchives(warcPath, sc).count == 299L) } test("Resource name produces expected result") { @@ -79,7 +79,7 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { textSampleArc.deep == Array(exampleDate, exampleDate, "20080430204826").deep ) assert( - textSampleWarc.deep == Array(exampleDate, exampleDate, exampleDate).deep + textSampleWarc.deep == Array(exampleDate, exampleDate, "20080430204826").deep ) } @@ -93,7 +93,7 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { .map(x => x.getDomain) .take(3) assert(textSampleArc.deep == Array("", exampleUrl, exampleUrl).deep) - assert(textSampleWarc.deep == Array("", "", exampleUrl).deep) + assert(textSampleWarc.deep == Array("", exampleUrl, exampleUrl).deep) } test("URLs") { @@ -114,9 +114,9 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { ) assert( textSampleWarc.deep == Array( - "", "dns:www.archive.org", - "http://www.archive.org/robots.txt" + "http://www.archive.org/robots.txt", + "http://www.archive.org/" ).deep ) } @@ -138,7 +138,7 @@ class ArchiveRecordTest extends FunSuite 
with BeforeAndAfter { ).deep ) assert( - textSampleWarc.deep == Array("unknown", "unknown", exampleMimeType).deep + textSampleWarc.deep == Array("unknown", exampleMimeType, "text/html").deep ) } @@ -161,7 +161,7 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { assert( textSampleWarc.deep == Array( exampleStatusCode1, - exampleStatusCode1, + exampleStatusCode2, exampleStatusCode2 ).deep ) @@ -185,9 +185,9 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { ) assert( textSampleWarc.deep == Array( - "sha1:B3CPX3Q4JK373UZA6HDKGYZVSNQDTGFQ", "sha1:RUIV2DUDYXONM2YTMGPAJVQKG3FSYHXE", - "sha1:sucgmuvxdkvb5cs2nl4r4jabnx7k466u" + "sha1:sucgmuvxdkvb5cs2nl4r4jabnx7k466u", + "sha1:2waxx5nuwnncs2bdkco5ovdqbjvnkivv" ).deep ) } diff --git a/src/test/scala/io/archivesunleashed/WarcTest.scala b/src/test/scala/io/archivesunleashed/WarcTest.scala index 03691249..44b9879f 100644 --- a/src/test/scala/io/archivesunleashed/WarcTest.scala +++ b/src/test/scala/io/archivesunleashed/WarcTest.scala @@ -42,7 +42,7 @@ class WarcTest extends FunSuite with BeforeAndAfter { } test("Count records") { - assert(822L == records.count) + assert(299L == records.count) } test("WARC extract domain RDD") { diff --git a/src/test/scala/io/archivesunleashed/issues/WgetWarcTest.scala b/src/test/scala/io/archivesunleashed/issues/WgetWarcTest.scala index 56c2b237..9c90e440 100644 --- a/src/test/scala/io/archivesunleashed/issues/WgetWarcTest.scala +++ b/src/test/scala/io/archivesunleashed/issues/WgetWarcTest.scala @@ -54,9 +54,9 @@ class WgetWarcTest extends FunSuite with BeforeAndAfter { assert(dfResults.length == RESULTSLENGTH) assert(dfResults(0).get(0) == "20210511181400") - assert(dfResults(0).get(1) == "") - assert(dfResults(1).get(0) == "20210511181400") - assert(dfResults(1).get(1) == "http://www.archiveteam.org/") + assert(dfResults(0).get(1) == "http://www.archiveteam.org/") + assert(dfResults(1).get(0) == "20210511181401") + assert(dfResults(1).get(1) == "https://wiki.archiveteam.org/") } after { From efa72a00e76ebbf40f5389d0d2fe9c1d122c8c6a Mon Sep 17 00:00:00 2001 From: nruest Date: Sat, 21 May 2022 15:25:13 -0400 Subject: [PATCH 08/11] Cleanup * Documentation and formatting updates * Remove unneeded getContentBytes * Add missing spreadsheet mimetype --- .../io/archivesunleashed/ArchiveRecord.scala | 3 --- .../SparklingArchiveRecord.scala | 17 +++++++++++++---- .../scala/io/archivesunleashed/package.scala | 7 ++++--- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/main/scala/io/archivesunleashed/ArchiveRecord.scala b/src/main/scala/io/archivesunleashed/ArchiveRecord.scala index 48c3147e..d26b2a7f 100644 --- a/src/main/scala/io/archivesunleashed/ArchiveRecord.scala +++ b/src/main/scala/io/archivesunleashed/ArchiveRecord.scala @@ -28,9 +28,6 @@ trait ArchiveRecord extends Serializable { /** Returns the crawl month. */ def getCrawlMonth: String - /** Returns the content of the record as an array of bytes. */ - def getContentBytes: Array[Byte] - /** Returns the content of the record as a String. 
*/ def getContentString: String diff --git a/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala b/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala index 47f845bb..adbce764 100644 --- a/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala +++ b/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala @@ -26,10 +26,14 @@ import org.archive.webservices.sparkling.util.{ManagedVal, ValueSupplier} import org.archive.webservices.sparkling.warc.{WarcHeaders, WarcRecord} import scala.util.Try +/** Set MaxStringByteLength for SparklingArchiveRecord. */ object SparklingArchiveRecord { val MaxStringByteLength: Int = 1024 } +/** + * Extension of ArchiveRecord via Sparkling. + */ class SparklingArchiveRecord( filename: String, meta: WarcRecord, @@ -57,14 +61,13 @@ class SparklingArchiveRecord( } override def getArchiveFilename: String = filename + override def getCrawlDate: String = meta.timestamp.filter(_.length >= 14).map(_.take(14)).getOrElse("") + override def getCrawlMonth: String = warc.timestamp.filter(_.length >= 6).map(_.take(6)).getOrElse("") - override def getContentBytes: Array[Byte] = - Try { - payload(warc) - }.getOrElse(Array.empty) + override def getContentString: String = Try { val record = @@ -76,17 +79,23 @@ class SparklingArchiveRecord( } .getOrElse(new String(payload(record))) }.getOrElse("") + override def getMimeType: String = http(warc).flatMap(_.mime).getOrElse("unknown") + override def getUrl: String = warc.url.getOrElse("").replaceAll("<|>", "") + override def getDomain: String = ExtractDomain(getUrl) + override def getBinaryBytes: Array[Byte] = Try { var record = warc http(record).map(_.body).map(IOUtil.bytes).getOrElse(payload(record)) }.getOrElse(Array.empty) + override def getHttpStatus: String = http(warc).map(_.status.toString).getOrElse("000") + override def getPayloadDigest: String = Try { meta.payloadDigest.orElse(warc.digestPayload()).getOrElse("") diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala index bac71532..8b5d15cf 100644 --- a/src/main/scala/io/archivesunleashed/package.scala +++ b/src/main/scala/io/archivesunleashed/package.scala @@ -129,7 +129,7 @@ package object archivesunleashed { } /** - * A Wrapper class around DF to allow Dfs of type ARCRecord and WARCRecord to be queried via a fluent API. + * A Wrapper class around DF to allow Dfs of type ArchiveRecord to be queried via a fluent API. * * To load such an DF, please use [[RecordLoader]] and apply .all() on it. */ @@ -155,7 +155,7 @@ package object archivesunleashed { } /** - * A Wrapper class around RDD to allow RDDs of type ARCRecord and WARCRecord to be queried via a fluent API. + * A Wrapper class around RDD to allow RDDs of type ArchiveRecord to be queried via a fluent API. * * To load such an RDD, please see [[RecordLoader]]. */ @@ -163,7 +163,7 @@ package object archivesunleashed { extends java.io.Serializable { /* Creates a column for Bytes as well in Dataframe. - Call KeepImages OR KeepValidPages on RDD depending upon the requirement before calling this method */ + Call KeepImages OR KeepValidPages on RDD depending upon the requirement before calling this method. 
+     */
     def all(): DataFrame = {
       val records = rdd
         .removeFiledesc()
@@ -570,6 +570,7 @@ package object archivesunleashed {
         .map(r => (r, (DetectMimeTypeTika(r.getBinaryBytes))))
         .filter(r =>
           r._2 == "application/vnd.ms-powerpoint"
+            || r._2 == "application/vnd.apple.keynote"
             || r._2 == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
             || r._2 == "application/vnd.oasis.opendocument.presentation"
             || r._2 == "application/vnd.oasis.opendocument.presentation-template"

From f04b841a78e2f52d58a94680947c50f2914aaecd Mon Sep 17 00:00:00 2001
From: nruest
Date: Sun, 22 May 2022 12:15:41 -0400
Subject: [PATCH 09/11] Remove shadowed AU tika-parsers dependency, and use org.tika again.
---
 pom.xml | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/pom.xml b/pom.xml
index b3dbb1b6..9748211f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -67,10 +67,6 @@
       maven
       https://repo.maven.apache.org/maven2/
-      mvn-repo
-      https://raw.githubusercontent.com/archivesunleashed/aut-resources/master/mvn-repo
       jitpack.io
       https://jitpack.io
@@ -512,10 +508,7 @@
       ${tika.version}
-
-      com.github.archivesunleashed.tika
       tika-parsers
       ${tika.version}

From dbeab4d573e53b88db31966a79027ad0404ef900 Mon Sep 17 00:00:00 2001
From: nruest
Date: Sun, 22 May 2022 13:13:34 -0400
Subject: [PATCH 10/11] Add domain column to webpages().
---
 src/main/scala/io/archivesunleashed/package.scala | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala
index 8b5d15cf..b4f26aad 100644
--- a/src/main/scala/io/archivesunleashed/package.scala
+++ b/src/main/scala/io/archivesunleashed/package.scala
@@ -223,6 +223,7 @@ package object archivesunleashed {
         .map(r =>
           Row(
             r.getCrawlDate,
+            ExtractDomain(r.getUrl).replaceAll("^\\s*www\\.", ""),
             r.getUrl,
             r.getMimeType,
             DetectMimeTypeTika(r.getBinaryBytes),
@@ -233,6 +234,7 @@ package object archivesunleashed {

       val schema = new StructType()
         .add(StructField("crawl_date", StringType, true))
+        .add(StructField("domain", StringType, true))
         .add(StructField("url", StringType, true))
         .add(StructField("mime_type_web_server", StringType, true))
         .add(StructField("mime_type_tika", StringType, true))

From e75d1b15a8f0421150be6cb3797a87f0fe538f77 Mon Sep 17 00:00:00 2001
From: nruest
Date: Mon, 23 May 2022 14:01:23 -0400
Subject: [PATCH 11/11] Update tests for #534
---
 .../io/archivesunleashed/df/DataFrameLoaderTest.scala     | 4 +++-
 src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala | 7 +++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala
index 9180c634..d021006d 100644
--- a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala
@@ -36,6 +36,7 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
   private val master = "local[4]"
   private val appName = "example-df"
   private var sc: SparkContext = _
+  private val domain = "domain"
   private val url = "url"
   private val mime_type = "mime_type_web_server"
   private val md5 = "md5"
@@ -61,7 +62,8 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
     val word = df.wordProcessorFiles(docPath)
     val all = df.all(arcPath)

-    val r_1 = validPages.select(url, mime_type).take(1)(0)
+    val r_1 = validPages.select(domain, url, mime_type).take(1)(0)
+    assert(r_1.getAs[String](domain) == "archive.org")
     assert(r_1.getAs[String](url) == "http://www.archive.org/")
     assert(r_1.getAs[String](mime_type) == "text/html")

diff --git a/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala b/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala
index 5a6574c6..7088bf09 100644
--- a/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala
@@ -51,15 +51,14 @@ class SimpleDfTest extends FunSuite with BeforeAndAfter {
     // scalastyle:on

     val results = df
-      .select(extractDomain($"Url").as("Domain"))
-      .groupBy("Domain")
+      .groupBy($"domain")
       .count()
-      .orderBy(desc("count"))
+      .sort($"count".desc)
       .head(3)

     // Results should be:
     // +------------------+-----+
-    // |            Domain|count|
+    // |            domain|count|
     // +------------------+-----+
     // |       archive.org|   91|
     // |     deadlists.com|    2|
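
A quick orientation for anyone applying this series downstream: with
getContentBytes removed in PATCH 08/11, record content is read through
getContentString (string form) or getBinaryBytes (raw HTTP payload bytes).
The sketch below is illustrative only and not part of the patches; it
assumes a spark-shell session with aut on the classpath, where sc is
predefined, and "/path/to/warcs" is a placeholder path.

    import io.archivesunleashed._

    // Load an RDD[ArchiveRecord]; the path is a placeholder, not a file in this repo.
    val records = RecordLoader.loadArchives("/path/to/warcs", sc)

    // Where getContentBytes was used before, getContentString now covers the
    // textual view and getBinaryBytes covers the raw payload.
    records
      .keepValidPages()
      .map(r => (r.getUrl, r.getContentString.length, r.getBinaryBytes.length))
      .take(3)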
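
Likewise, PATCH 10/11 and PATCH 11/11 mean webpages() now exposes a domain
column (with any leading "www." stripped), so queries no longer need to
derive it from the url column. A minimal sketch of the query style the
updated SimpleDfTest exercises, under the same spark-shell assumptions as
above (spark is the shell's predefined SparkSession):

    import io.archivesunleashed._

    // spark.implicits supplies the $"..." column syntax used below.
    import spark.implicits._

    val pages = RecordLoader
      .loadArchives("/path/to/warcs", sc)
      .webpages()

    // Group directly on the new domain column, mirroring SimpleDfTest.
    pages
      .groupBy($"domain")
      .count()
      .sort($"count".desc)
      .show(3)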