Documentation updates for archivesunleashed/aut#533 & archivesunleash…
ruebot committed May 22, 2022
1 parent 1059bc6 commit ca6b8d4
Showing 8 changed files with 313 additions and 58 deletions.
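The change is the same across the files shown below: the `.write.csv(path)` shorthand is replaced by an explicit `DataFrameWriter` chain that pins down the timestamp format, quote escaping, and encoding of the CSV derivatives. A minimal sketch of the before/after pattern, using only standard Spark API; `df` is a stand-in for any aut DataFrame (e.g. `webpages`):

```scala
// Before: shorthand write, every CSV option left at Spark's default.
df.write.csv("/path/to/output")

// After: explicit format and options. `escape` keeps embedded double
// quotes parseable, `encoding` pins the output to UTF-8, and
// `timestampFormat` fixes how timestamp columns are rendered.
df.write
  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .save("/path/to/output")
```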
39 changes: 29 additions & 10 deletions docs/auk-derivatives.md
@@ -38,15 +38,24 @@ val webpages = RecordLoader.loadArchives("/path/to/data", sc)
val webgraph = RecordLoader.loadArchives("/path/to/data", sc)
  .webgraph()

-// Domains file.
-webpages.groupBy(removePrefixWWW(extractDomain($"Url")).alias("url"))
+// Domains frequency file.
+webpages.groupBy($"domain")
  .count()
  .sort($"count".desc)
-  .write.csv("/path/to/derivatives/auk/all-domains/output")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("/path/to/derivatives/auk/all-domains/output")

// Full-text.
-webpages.select($"crawl_date", removePrefixWWW(extractDomain($"url")).alias("domain"), $"url", $"content")
-  .write.csv("/path/to/derivatives/auk/full-text/output")
+webpages.write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("/path/to/derivatives/auk/full-text/output")

// GraphML
val graph = webgraph.groupBy(
@@ -77,14 +86,24 @@ webpages = WebArchive(sc, sqlContext, "/path/to/data").webpages()
webgraph = WebArchive(sc, sqlContext, "/path/to/data").webgraph()

# Domains file.
-webpages.groupBy(remove_prefix_www(extract_domain("url")).alias("url")) \
+webpages.groupBy("domain") \
  .count() \
  .sort(col("count").desc()) \
-  .write.csv("/path/to/derivatives/auk/all-domains/output")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("/path/to/derivatives/auk/all-domains/output")

# Full-text.
-webpages.select("crawl_date", remove_prefix_www(extract_domain("url")).alias("domain"), "url", "content") \
-  .write.csv("/path/to/derivatives/auk/full-text/output")
+webpages.write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("/path/to/derivatives/auk/full-text/output")

# Create DataFrame for GraphML output
graph = webgraph.groupBy("crawl_date", remove_prefix_www(extract_domain("src")).alias("src_domain"), remove_prefix_www(extract_domain("dest")).alias("dest_domain")) \
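Note that the new full-text recipe writes every column of `webpages`. If you only want the old derivative's columns, a `select` before the write should still work; a sketch, not part of the commit, assuming the `webpages` val defined above and the new `domain` column:

```scala
// Keep only the old full-text shape: crawl_date, domain, url, content.
webpages
  .select($"crawl_date", $"domain", $"url", $"content")
  .write
  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .save("/path/to/derivatives/auk/full-text/output")
```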
14 changes: 12 additions & 2 deletions docs/binary-analysis.md
@@ -442,7 +442,12 @@ val result = images.join(links, "md5")
.groupBy("domain", "md5")
.agg(first("url").as("image_url"))
.orderBy(asc("md5"))
.write.csv("/path/to/output")
.write
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
.format("csv")
.option("escape", "\"")
.option("encoding", "utf-8")
.save("/path/to/output")
```

### PythonDF
@@ -463,7 +468,12 @@ result = images.join(links, "md5") \
.groupBy("domain", "md5") \
.agg(first("url").alias("image_url")) \
.orderBy(asc("md5")) \
.write.csv("/path/to/output")
.write
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
.format("csv")
.option("escape", "\"")
.option("encoding", "utf-8")
.save("/path/to/output")
```

## Extract PDF Information
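When these CSV derivatives are read back into Spark, mirroring the write-side options keeps fields with embedded quotes intact. A sketch, not from the commit, using the standard `DataFrameReader` API in a spark-shell:

```scala
// Read a derivative back with options that match the write side.
val derivative = spark.read
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .load("/path/to/output")
```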
1 change: 1 addition & 0 deletions docs/dataframe-schemas.md
@@ -28,6 +28,7 @@ language); and `.webgraph()` which includes hyperlink information.

- `crawl_date` (string)
+- `domain` (string)
- `url` (string)
- `mime_type_web_server` (string)
- `mime_type_tika` (string)
- `language` (string)
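The other diffs in this commit swap `removePrefixWWW(extractDomain($"url"))` for the new `domain` column, so the two should agree. A quick check, sketched under that assumption for an aut spark-shell:

```scala
import io.archivesunleashed._
import io.archivesunleashed.udfs.{extractDomain, removePrefixWWW}

val webpages = RecordLoader.loadArchives("/path/to/data", sc).webpages()

// The stored column and the hand-derived value should match row by row.
webpages
  .select($"domain", removePrefixWWW(extractDomain($"url")).alias("derived"))
  .show(5, false)
```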
58 changes: 49 additions & 9 deletions docs/extract-binary-info.md
@@ -46,7 +46,12 @@ val warcsS3 = RecordLoader.loadArchives("s3a://your-data-bucket/", sc)
warcs.audio()
  .select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5")
  .orderBy(desc("md5"))
-  .write.csv("/path/to/derivatives/csv/audio")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("/path/to/derivatives/csv/audio")

// Images.
warcsS3.images()
@@ -58,7 +63,12 @@ warcsS3.images()
warcs.pdfs()
  .select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5")
  .orderBy(desc("md5"))
-  .write.csv("s3a://your-derivatives-bucket/csv/pdf")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("s3a://your-derivatives-bucket/csv/pdf")

// Presentation Program Files.
warcs.presentationProgramFiles()
@@ -70,13 +80,23 @@ warcs.presentationProgramFiles()
warcs.spreadsheets()
  .select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5")
  .orderBy(desc("md5"))
-  .write.csv("/path/to/derivatives/csv/spreadsheet")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("/path/to/derivatives/csv/spreadsheet")

// Videos.
warcs.videos()
  .select($"url", $"filename", $"extension", $"mime_type_web_server", $"mime_type_tika", $"md5")
  .orderBy(desc("md5"))
-  .write.csv("/path/to/derivatives/csv/video")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("/path/to/derivatives/csv/video")

// Word Processor Files.
warcs.wordProcessorFiles()
@@ -98,26 +118,41 @@ warcs = WebArchive(sc, sqlContext, "/path/to/aut-resources-master/Sample-Data/*g
# Choose your format: CSV or Parquet.

# For CSV:
-# .write.csv('/path/to/derivatives/csv/audio', header='true')
+# .write.csv('/path/to/derivatives/csv/audio')
+# Include header='true' if you want headers.

# For Parquet:
# .write.parquet("/path/to/derivatives/parquet/pages/")

# Audio Files.
-warcs.audio().write.csv('/path/to/derivatives/csv/audio', header='true')
+warcs.audio().write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save('/path/to/derivatives/csv/audio')

# Images.
warcs.images().write.parquet('/path/to/derivatives/parquet/images')

# Image Links.
-warcs.image_links().write.csv('/path/to/derivatives/csv/images-links', header='true')
+warcs.image_links().write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save('/path/to/derivatives/csv/images-links')

# PDFs.
warcs.pdfs().write.parquet('/path/to/derivatives/csv/pdfs')

# Spreadsheets.
-warcs.spreadsheets().write.csv('/path/to/derivatives/csv/spreadsheets', header='true')
+warcs.spreadsheets().write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save('/path/to/derivatives/csv/spreadsheets')

# Presentation Program Files.
warcs.presentation_program().write.parquet('/path/to/derivatives/csv/presentation_program')

@@ -126,5 +161,10 @@ warcs.presentation_program().write.parquet('/path/to/derivatives/csv/presentatio

# Videos.
warcs.video().write.parquet('/path/to/derivatives/csv/video')

# Word Processor Files.
-warcs.word_processor().write.csv('/path/to/derivatives/csv/word_processor', header='true')
+warcs.word_processor().write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save('/path/to/derivatives/csv/word_processor')
```
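The comment about `header='true'` carries over to the explicit writer as one more option. A sketch, not in the commit, using Spark's standard `header` CSV option with the `warcs` val from the Scala section of this file:

```scala
// Same derivative as above, with a header row added to the CSV.
warcs.audio().write
  .option("header", "true")
  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
  .format("csv")
  .option("escape", "\"")
  .option("encoding", "utf-8")
  .save("/path/to/derivatives/csv/audio")
```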
14 changes: 12 additions & 2 deletions docs/image-analysis.md
@@ -321,7 +321,12 @@ val result = images.join(links, "md5")
.groupBy("domain", "md5")
.agg(first("url").as("image_url"))
.orderBy(asc("md5"))
.write.csv("/path/to/output")
.write
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
.format("csv")
.option("escape", "\"")
.option("encoding", "utf-8")
.save("/path/to/output")
```

### PythonDF
@@ -342,5 +347,10 @@ result = images.join(links, "md5") \
.groupBy("domain", "md5") \
.agg(first("url").alias("image_url")) \
.orderBy(asc("md5")) \
.write.csv("/path/to/output")
.write \
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
.format("csv") \
.option("escape", "\"") \
.option("encoding", "utf-8") \
.save("/path/to/output")
```
77 changes: 66 additions & 11 deletions docs/link-analysis.md
@@ -69,7 +69,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
  .groupBy(removePrefixWWW(extractDomain($"src")).as("src"), removePrefixWWW(extractDomain($"dest")).as("dest"))
  .count()
  .filter($"count" > 5)
-  .write.csv("links-all-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("links-all-df/")
```

```scala
@@ -86,7 +91,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
.groupBy("src", "dest")
.count()
.filter($"count" > 5)
.write.csv("links-all-apple-df/")
.write
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
.format("csv")
.option("escape", "\"")
.option("encoding", "utf-8")
.save("links-all-apple-df/")
```

### Python DF
@@ -105,7 +115,12 @@ WebArchive(sc, sqlContext, "/path/to/warcs") \
.groupBy("src", "dest") \
.count() \
.filter(col("count") > 5) \
.write.csv("links-all-apple-df/")
.write \
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
.format("csv") \
.option("escape", "\"") \
.option("encoding", "utf-8") \
.save("links-all-apple-df/")
```

## Extract Raw URL Link Structure
@@ -153,7 +168,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
  .groupBy(extractDomain($"src"), extractDomain($"dest"))
  .count()
  .filter($"count" > 5)
-  .write.csv("full-links-all-df/")
+  .write
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
+  .format("csv")
+  .option("escape", "\"")
+  .option("encoding", "utf-8")
+  .save("full-links-all-df/")
```

### Python DF
@@ -167,7 +187,12 @@ WebArchive(sc, sqlContext, "/path/to/warcs") \
  .groupBy(extract_domain("src"), extract_domain("dest")) \
  .count() \
  .filter(col("count") > 5) \
-  .write.csv("full-links-all-df/")
+  .write \
+  .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
+  .format("csv") \
+  .option("escape", "\"") \
+  .option("encoding", "utf-8") \
+  .save("full-links-all-df/")
```

## Organize Links by URL Pattern
@@ -209,7 +234,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
.groupBy("src", "dest")
.count()
.filter($"count" > 5)
.write.csv("details-links-all-df/")
.write
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
.format("csv")
.option("escape", "\"")
.option("encoding", "utf-8")
.save("details-links-all-df/")
```

### Python DF
@@ -228,7 +258,12 @@ WebArchive(sc, sqlContext, "/path/to/warcs") \
.groupBy("src", "dest") \
.count() \
.filter(col("count") > 5) \
.write.csv("details-links-all-df/")
.write \
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
.format("csv") \
.option("escape", "\"") \
.option("encoding", "utf-8") \
.save("details-links-all-df/")
```

## Organize Links by Crawl Date
@@ -293,7 +328,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
.groupBy($"crawl_date", removePrefixWWW(extractDomain($"src")), removePrefixWWW(extractDomain($"dest")))
.count()
.filter($"count" > 5)
.write.csv("sitelinks-by-date-df/")
.write
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
.format("csv")
.option("escape", "\"")
.option("encoding", "utf-8")
.save("sitelinks-by-date-df/")
```

### Python DF
@@ -307,7 +347,12 @@ WebArchive(sc, sqlContext, "/path/to/warcs") \
.groupBy("crawl_date", remove_prefix_www(extract_domain("src")).alias("src"), remove_prefix_www(extract_domain("dest")).alias("dest")) \
.count() \
.filter(col("count") > 5) \
.write.csv("sitelinks-by-date-df/")
.write \
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
.format("csv") \
.option("escape", "\"") \
.option("encoding", "utf-8") \
.save("sitelinks-by-date-df/")
```

## Filter by URL
@@ -348,7 +393,12 @@ RecordLoader.loadArchives("/path/to/warcs", sc)
.groupBy("src", "dest")
.count()
.filter($"count" > 5)
.write.csv("sitelinks-details-df/")
.write
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
.format("csv")
.option("escape", "\"")
.option("encoding", "utf-8")
.save("sitelinks-details-df/")
```

### Python DF
@@ -367,7 +417,12 @@ WebArchive(sc, sqlContext, "/path/to/warcs") \
.groupBy("src", "dest") \
.count() \
.filter(col("count") > 5) \
.write.csv("sitelinks-details-df/")
.write \
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") \
.format("csv") \
.option("escape", "\"") \
.option("encoding", "utf-8") \
.save("sitelinks-details-df/")
```
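The same five-option writer block now appears in every recipe in this file. If the repetition bothers you, it can be factored into a small helper; a hypothetical convenience function, not part of aut's API:

```scala
import org.apache.spark.sql.DataFrame

// Hypothetical helper: write any DataFrame as CSV with the options
// this commit standardizes on.
def saveCsv(df: DataFrame, path: String): Unit =
  df.write
    .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
    .format("csv")
    .option("escape", "\"")
    .option("encoding", "utf-8")
    .save(path)
```

Each recipe's trailing `.write...save(...)` chain then collapses to a single call, e.g. `saveCsv(counts, "sitelinks-details-df/")`, where `counts` stands for the aggregated DataFrame built above it.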

## Export to Gephi