From ca6b8d4fc234913d8746876c43cef7216be7b4d7 Mon Sep 17 00:00:00 2001
From: nruest
Date: Sun, 22 May 2022 14:07:42 -0400
Subject: [PATCH] Documentation updates for https://github.com/archivesunleashed/aut/pull/533 & https://github.com/archivesunleashed/aut/issues/534

---
 docs/auk-derivatives.md     |  39 ++++++++---
 docs/binary-analysis.md     |  14 +++-
 docs/dataframe-schemas.md   |   1 +
 docs/extract-binary-info.md |  58 +++++++++++++---
 docs/image-analysis.md      |  14 +++-
 docs/link-analysis.md       |  77 ++++++++++++++++++---
 docs/text-analysis.md       | 133 ++++++++++++++++++++++++++++++------
 docs/toolkit-walkthrough.md |  35 ++++++++--
 8 files changed, 313 insertions(+), 58 deletions(-)

diff --git a/docs/auk-derivatives.md b/docs/auk-derivatives.md
index deb84404..e48215a8 100644
--- a/docs/auk-derivatives.md
+++ b/docs/auk-derivatives.md
@@ -38,15 +38,24 @@ val webpages = RecordLoader.loadArchives("/path/to/data", sc)
 val webgraph = RecordLoader.loadArchives("/path/to/data", sc)
   .webgraph()
 
-// Domains file.
-webpages.groupBy(removePrefixWWW(extractDomain($"Url")).alias("url"))
+// Domains frequency file.
+webpages.groupBy($"domain")
   .count()
   .sort($"count".desc)
-  .write.csv("/path/to/derivatives/auk/all-domains/output")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("/path/to/derivatives/auk/all-domains/output")
 
 // Full-text.
-webpages.select($"crawl_date", removePrefixWWW(extractDomain(($"url")).alias("domain")), $"url", $"content")
-  .write.csv("/path/to/derivatives/auk/full-text/output")
+webpages.write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("/path/to/derivatives/auk/full-text/output")
 
 // GraphML
 val graph = webgraph.groupBy(
@@ -77,14 +86,24 @@ webpages = WebArchive(sc, sqlContext, "/path/to/data").webpages()
 webgraph = WebArchive(sc, sqlContext, "/path/to/data").webgraph()
 
 # Domains file.
-webpages.groupBy(remove_prefix_www(extract_domain("url")).alias("url")) \
+webpages.groupBy("domain") \
   .count() \
-  .sort(col("count").desc()) \
-  .write.csv("/path/to/derivatives/auk/all-domains/output")
+  .sort(col("count")\
+  .desc()) \
+  .write\
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\
+  .format("csv")\
+  .option("escape", "\"")\
+  .option("encoding", "utf-8")\
+  .save("/path/to/derivatives/auk/all-domains/output")
 
 # Full-text.
-webpages.select("crawl_date", remove_prefix_www(extract_domain("url")).alias("domain"), "url", "content")\
-  .write.csv("/path/to/derivatives/auk/full-text/output")
+webpages.write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\
+  .format("csv")\
+  .option("escape", "\"")\
+  .option("encoding", "utf-8")\
+  .save("/path/to/derivatives/auk/full-text/output")
 
 # Create DataFrame for GraphML output
 graph = webgraph.groupBy("crawl_date", remove_prefix_www(extract_domain("src")).alias("src_domain"), remove_prefix_www(extract_domain("dest")).alias("dest_domain"))\
diff --git a/docs/binary-analysis.md b/docs/binary-analysis.md
index 44be43e7..d41bd1fb 100644
--- a/docs/binary-analysis.md
+++ b/docs/binary-analysis.md
@@ -442,7 +442,12 @@ val result = images.join(links, "md5")
   .groupBy("domain", "md5")
   .agg(first("url").as("image_url"))
   .orderBy(asc("md5"))
-  .write.csv("/path/to/output")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("/path/to/output")
 ```
 
 ### PythonDF
@@ -463,7 +468,12 @@ result = images.join(links, "md5") \
   .groupBy("domain", "md5") \
   .agg(first("url").alias("image_url")) \
   .orderBy(asc("md5")) \
-  .write.csv("/path/to/output")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("/path/to/output")
 ```
 
 ## Extract PDF Information
diff --git a/docs/dataframe-schemas.md b/docs/dataframe-schemas.md
index 66b39145..016538cf 100644
--- a/docs/dataframe-schemas.md
+++ b/docs/dataframe-schemas.md
@@ -28,6 +28,7 @@ language); and `.webgraph()` which includes hyperlink information.
 
 - `crawl_date` (string)
+- `domain` (string)
 - `url` (string)
 - `mime_type_web_server` (string)
 - `mime_type_tika` (string)
 - `language` (string)
diff --git a/docs/extract-binary-info.md b/docs/extract-binary-info.md
index c67a8cd0..f19e99fa 100644
--- a/docs/extract-binary-info.md
+++ b/docs/extract-binary-info.md
@@ -46,7 +46,12 @@ val warcsS3 = RecordLoader.loadArchives("s3a://your-data-bucket/", sc)
 warcs.audio()
   .select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5")
   .orderBy(desc("md5"))
-  .write.csv("/path/to/derivatives/csv/audio")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("/path/to/derivatives/csv/audio")
 
 // Images.
 warcsS3.images()
   .select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5")
   .orderBy(desc("md5"))
@@ -58,7 +63,12 @@ warcsS3.images()
 warcs.pdfs()
   .select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5")
   .orderBy(desc("md5"))
-  .write.csv("s3a://your-derivatives-bucket/csv/pdf")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("s3a://your-derivatives-bucket/csv/pdf")
 
 // Presentation Program Files.
 warcs.presentationProgramFiles()
   .select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5")
   .orderBy(desc("md5"))
@@ -70,13 +80,23 @@ warcs.presentationProgramFiles()
 warcs.spreadsheets()
   .select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5")
   .orderBy(desc("md5"))
-  .write.csv("/path/to/derivatives/csv/spreadsheet")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("/path/to/derivatives/csv/spreadsheet")
 
 // Videos.
 warcs.videos()
   .select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5")
   .orderBy(desc("md5"))
-  .write.csv("/path/to/derivatives/csv/video")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("/path/to/derivatives/csv/video")
 
 // Word Processor Files.
 warcs.wordProcessorFiles()
   .select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5")
   .orderBy(desc("md5"))
@@ -98,26 +118,41 @@ warcs = WebArchive(sc, sqlContext, "/path/to/aut-resources-master/Sample-Data/*g
 
 # Choose your format: CSV or Parquet.
 # For CSV:
-# .write.csv('/path/to/derivatives/csv/audio', header='true')
+# .write.csv('/path/to/derivatives/csv/audio') # Include header='true' if you want headers.
 # For Parquet:
 # .write.parquet("/path/to/derivatives/parquet/pages/")
 
 # Audio Files.
-warcs.audio().write.csv('/path/to/derivatives/csv/audio', header='true')
+warcs.audio().write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save('/path/to/derivatives/csv/audio')
 
 # Images.
 warcs.images().write.parquet('/path/to/derivatives/parquet/images')
 
 # Image Links.
-warcs.image_links().write.csv('/path/to/derivatives/csv/images-links', header='true')
+warcs.image_links().write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save('/path/to/derivatives/csv/images-links')
 
 # PDFs.
 warcs.pdfs().write.parquet('/path/to/derivatives/csv/pdfs')
 
 # Spreadsheets.
-warcs.spreadsheets().write.csv('/path/to/derivatives/csv/spreadsheets', header='true')
+warcs.spreadsheets().write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save('/path/to/derivatives/csv/spreadsheets')
 
 # Presentation Program Files.
 warcs.presentation_program().write.parquet('/path/to/derivatives/csv/presentation_program')
@@ -126,5 +161,10 @@ warcs.presentation_program().write.parquet('/path/to/derivatives/csv/presentatio
 
 # Videos.
 warcs.video().write.parquet('/path/to/derivatives/csv/video')
 
 # Word Processor Files.
-warcs.word_processor().write.csv('/path/to/derivatives/csv/word_processor', header='true')
+warcs.word_processor().write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save('/path/to/derivatives/csv/word_processor')
 ```
diff --git a/docs/image-analysis.md b/docs/image-analysis.md
index 10e6ea19..f676537b 100644
--- a/docs/image-analysis.md
+++ b/docs/image-analysis.md
@@ -321,7 +321,12 @@ val result = images.join(links, "md5")
   .groupBy("domain", "md5")
   .agg(first("url").as("image_url"))
   .orderBy(asc("md5"))
-  .write.csv("/path/to/output")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("/path/to/output")
 ```
 
 ### PythonDF
@@ -342,5 +347,10 @@ result = images.join(links, "md5") \
   .groupBy("domain", "md5") \
   .agg(first("url").alias("image_url")) \
   .orderBy(asc("md5")) \
-  .write.csv("/path/to/output")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("/path/to/output")
 ```
diff --git a/docs/link-analysis.md b/docs/link-analysis.md
index 6763de53..865ffdff 100644
--- a/docs/link-analysis.md
+++ b/docs/link-analysis.md
@@ -69,7 +69,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
   .groupBy(removePrefixWWW(extractDomain($"src")).as("src"), removePrefixWWW(extractDomain($"dest")).as("dest"))
   .count()
   .filter($"count" > 5)
-  .write.csv("links-all-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("links-all-df/")
 ```
 
 ```scala
@@ -86,7 +91,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
   .groupBy("src", "dest")
   .count()
   .filter($"count" > 5)
-  .write.csv("links-all-apple-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("links-all-apple-df/")
 ```
 
 ### Python DF
@@ -105,7 +115,12 @@ WebArchive(sc, sqlContext, "/path/to/warcs") \
   .groupBy("src", "dest") \
   .count() \
   .filter(col("count") > 5) \
-  .write.csv("links-all-apple-df/")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("links-all-apple-df/")
 ```
 
 ## Extract Raw URL Link Structure
@@ -153,7 +168,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
   .groupBy(extractDomain($"src"), extractDomain($"dest"))
   .count()
   .filter($"count" > 5)
-  .write.csv("full-links-all-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("full-links-all-df/")
 ```
 
 ### Python DF
@@ -167,7 +187,12 @@ WebArchive(sc, sqlContext, "/path/to/warcs") \
  .groupBy(extract_domain("src"), extract_domain("dest")) \
  .count() \
  .filter(col("count") > 5) \
-  .write.csv("full-links-all-df/")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("full-links-all-df/")
 ```
 
 ## Organize Links by URL Pattern
@@ -209,7 +234,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
   .groupBy("src", "dest")
   .count()
   .filter($"count" > 5)
-  .write.csv("details-links-all-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("details-links-all-df/")
 ```
 
 ### Python DF
@@ -228,7 +258,12 @@ WebArchive(sc, sqlContext, "/path/to/warcs") \
   .groupBy("src", "dest") \
   .count() \
   .filter(col("count") > 5) \
-  .write.csv("details-links-all-df/")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("details-links-all-df/")
 ```
 
 ## Organize Links by Crawl Date
@@ -293,7 +328,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
   .groupBy($"crawl_date", removePrefixWWW(extractDomain($"src")), removePrefixWWW(extractDomain($"dest")))
   .count()
   .filter($"count" > 5)
-  .write.csv("sitelinks-by-date-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("sitelinks-by-date-df/")
 ```
 
 ### Python DF
@@ -307,7 +347,12 @@ WebArchive(sc, sqlContext, "/path/to/warcs") \
   .groupBy("crawl_date", remove_prefix_www(extract_domain("src")).alias("src"), remove_prefix_www(extract_domain("dest")).alias("dest")) \
   .count() \
   .filter(col("count") > 5) \
-  .write.csv("sitelinks-by-date-df/")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("sitelinks-by-date-df/")
 ```
 
 ## Filter by URL
@@ -348,7 +393,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
   .groupBy("src", "dest")
   .count()
   .filter($"count" > 5)
-  .write.csv("sitelinks-details-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("sitelinks-details-df/")
 ```
 
 ### Python DF
@@ -367,7 +417,12 @@ WebArchive(sc, sqlContext, "/path/to/warcs") \
   .groupBy("src", "dest") \
   .count() \
   .filter(col("count") > 5) \
-  .write.csv("sitelinks-details-df/")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("sitelinks-details-df/")
 ```
 
 ## Export to Gephi
diff --git a/docs/text-analysis.md b/docs/text-analysis.md
index 502f19a1..19b9246f 100644
--- a/docs/text-analysis.md
+++ b/docs/text-analysis.md
@@ -33,7 +33,12 @@ import io.archivesunleashed.udfs._
 RecordLoader.loadArchives("/path/to/warcs", sc)
   .webpages()
   .select($"crawl_date", extractDomain($"url"), $"url", removeHTML($"content"))
-  .write.csv("plain-text-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("plain-text-df/")
 ```
 
 ### Python DF
@@ -44,7 +49,12 @@ from aut import *
 WebArchive(sc, sqlContext, "/path/to/warcs") \
   .webpages() \
   .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html("content")) \
-  .write.csv("plain-text-df/")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("plain-text-df/")
 ```
 
 ## Extract Plain Text Without HTTP Headers
@@ -76,7 +86,12 @@ import io.archivesunleashed.udfs._
 RecordLoader.loadArchives("/path/to/warcs", sc)
   .webpages()
   .select(removeHTML(removeHTTPHeader($"content")))
-  .write.csv("plain-text-noheaders-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("plain-text-noheaders-df/")
 ```
 
 ### Python DF
@@ -87,7 +102,12 @@ from aut import *
 WebArchive(sc, sqlContext, "/path/to/warcs") \
   .webpages() \
   .select(remove_html(remove_http_header("content"))) \
-  .write.csv("plain-text-noheaders-df/")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")\
+  .format("csv")\
+  .option("escape", "\"")\
+  .option("encoding", "utf-8")\
+  .save("plain-text-noheaders-df/")
 ```
 
 ## Extract Plain Text By Domain
@@ -122,7 +142,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
   .webpages()
   .select($"crawl_date", extractDomain($"url").alias("domain"), $"url", removeHTML(removeHTTPHeader($"content")))
   .filter(hasDomains($"domain", lit(domains)))
-  .write.csv("plain-text-domain-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("plain-text-domain-df/")
 ```
 
 ### Python DF
@@ -137,7 +162,12 @@ WebArchive(sc, sqlContext, "/path/to/warcs") \
   .webpages() \
   .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html(remove_http_header("content"))) \
   .filter(col("domain").isin(domains)) \
-  .write.csv("plain-text-domain-df/")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("plain-text-domain-df/")
 ```
 
 ## Extract Plain Text by URL Pattern
@@ -174,7 +204,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
   .webpages()
   .select($"crawl_date", extractDomain($"url").alias("domain"), $"url", removeHTML(removeHTTPHeader($"content")))
   .filter(hasUrlPatterns($"url", lit(urlPattern)))
-  .write.csv("details-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("details-df/")
 ```
 
 ### Python DF
@@ -189,7 +224,12 @@ WebArchive(sc, sqlContext, "/path/to/warcs") \
   .webpages() \
   .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html(remove_http_header("content"))) \
   .filter(col("url").like(url_pattern)) \
-  .write.csv("details-df/")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("details-df/")
 ```
 
 ## Extract Plain Text Minus Boilerplate
@@ -225,7 +265,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
   .webpages()
   .select($"crawl_date", extractDomain($"url"), $"url", extractBoilerpipeText(removeHTTPHeader($"content")))
   .filter(hasDomains($"domain", lit(domains)))
-  .write.csv("plain-text-no-boilerplate-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("plain-text-no-boilerplate-df/")
 ```
 
 ### Python DF
@@ -236,7 +281,12 @@ from aut import *
 WebArchive(sc, sqlContext, "/path/to/warcs") \
   .webpages() \
   .select("crawl_date", extract_domain("url").alias("domain"), "url", extract_boilerplate(remove_http_header("content")).alias("content")) \
-  .write.csv("plain-text-no-boilerplate-df/")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("plain-text-no-boilerplate-df/")
 ```
 
 ## Extract Plain Text Filtered by Date
@@ -313,7 +363,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
   .webpages()
   .select($"crawl_date", extractDomain($"url").as("domain"), $"url", removeHTML(removeHTTPHeader($"content")))
   .filter(hasDate($"crawl_date", lit(dates)))
-  .write.csv("plain-text-date-filtered-2008-2015-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("plain-text-date-filtered-2008-2015-df/")
 ```
 
 ### Python DF
@@ -328,7 +383,12 @@ WebArchive(sc, sqlContext, "/path/to/warcs") \
   .webpages() \
   .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html(remove_http_header("content"))) \
   .filter(col("crawl_date").rlike(dates)) \
-  .write.csv("plain-text-date-filtered-2008-2015-df/")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("plain-text-date-filtered-2008-2015-df/")
 ```
 
 ## Extract Plain Text Filtered by Language
@@ -365,7 +425,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
   .select($"crawl_date", extractDomain($"url").alias("domain"), $"url", $"language", removeHTML(removeHTTPHeader($"content")))
   .filter(hasDomains($"domain", lit(domains)))
   .filter(hasLanguages($"language", lit(languages)))
-  .write.csv("plain-text-fr-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("plain-text-fr-df/")
 ```
 
 ```scala
@@ -380,7 +445,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
   .filter(hasDomains(extractDomain($"url"), lit(domains)))
   .filter(hasLanguages($"language", lit(languages)))
   .select($"crawl_date", extractDomain($"url").alias("domain"), $"url", $"language", removeHTML(removeHTTPHeader($"content")))
-  .write.csv("plain-text-fr-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("plain-text-fr-df/")
 ```
 
 ### Python DF
@@ -397,7 +467,12 @@ WebArchive(sc, sqlContext, "/path/to/warcs") \
   .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html(remove_http_header("content"))) \
   .filter(col("domain").isin(domains)) \
   .filter(col("language").isin(languages)) \
-  .write.csv("plain-text-fr-df/")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("plain-text-fr-df/")
 ```
 
 ## Extract Plain text Filtered by Keyword
@@ -437,7 +512,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
   .webpages()
   .select($"crawl_date", extractDomain($"url").alias("domain"), $"url", removeHTML(removeHTTPHeader($"content")))
   .filter(hasContent($"content", lit(content)))
-  .write.csv("plain-text-radio-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("plain-text-radio-df/")
 ```
 
 ### Python DF
@@ -452,7 +532,12 @@ WebArchive(sc, sqlContext, "/path/to/warcs") \
   .webpages() \
   .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_html(remove_http_header("content"))) \
   .filter(col("content").like(content)) \
-  .write.csv("plain-text-radio-df/")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("plain-text-radio-df/")
 ```
 
 ## Extract Raw HTML
@@ -485,7 +570,12 @@ import io.archivesunleashed.udfs._
 RecordLoader.loadArchives("example.warc.gz", sc)
   .webpages()
   .select($"crawl_date", extractDomain($"url"), $"url", removeHTTPHeader($"content"))
-  .write.csv("plain-html-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("plain-html-df/")
 ```
 
 ### Python DF
@@ -496,5 +586,10 @@ from aut import *
 WebArchive(sc, sqlContext, "/path/to/warcs") \
   .webpages() \
   .select("crawl_date", extract_domain("url").alias("domain"), "url", remove_http_header("content")) \
-  .write.csv("plain-html-df/")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("plain-html-df/")
 ```
diff --git a/docs/toolkit-walkthrough.md b/docs/toolkit-walkthrough.md
index 02bcaa37..919b4334 100644
--- a/docs/toolkit-walkthrough.md
+++ b/docs/toolkit-walkthrough.md
@@ -208,7 +208,12 @@ RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
   .webpages()
   .select($"crawl_date", extractDomain($"url").alias("domain"), $"url", $"content")
   .filter(hasDomains($"domain", lit(domains)))
-  .write.csv("/data/liberal-party-text")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("/data/liberal-party-text")
 ```
 
 **If you're using your own data, that's why the domain count was key!** Swap
@@ -240,7 +245,12 @@ RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
   .webpages()
   .select($"crawl_date", extractDomain($"url").alias("domain"), $"url", $"content")
   .filter(hasDomains($"domain", lit(domains)))
-  .write.csv("/data/liberal-party-text")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("/data/liberal-party-text")
 ```
 
 Instead of a nice crisp feeling of success, you will see a long dump of text
@@ -289,7 +299,12 @@ RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
   .filter(hasDomains(extractDomain($"url"), lit(domains)))
   .filter(hasLanguages($"language", lit(languages)))
   .select($"crawl_date", extractDomain($"url").alias("domain"), $"url", $"content")
-  .write.csv("/data/liberal-party-french-text")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("/data/liberal-party-french-text")
 ```
 
 Or if we wanted to just have pages from 2006, we would run:
@@ -304,7 +319,12 @@ RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
   .webpages()
   .filter(hasDate($"crawl_date", lit(dates)))
   .select($"crawl_date", extractDomain($"url").alias("domain"), $"url", $"content")
-  .write.csv("/data/2006-text")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("/data/2006-text")
 ```
 
 Finally, if we want to remove the HTTP headers – let's say if we want to create
@@ -317,7 +337,12 @@ import io.archivesunleashed.udfs._
 RecordLoader.loadArchives("/aut-resources/Sample-Data/*.gz", sc)
   .webpages()
   .select($"crawl_date", extractDomain($"url").alias("domain"), $"url", $"content")
-  .write.csv("/data/text-no-headers")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("/data/text-no-headers")
 ```
 
 You could now try uploading one of the plain text files using a website like