Mix.install([
{:spider_man, "~> 0.3"},
{:floki, "~> 0.31"},
{:nimble_csv, "~> 1.1"},
{:kino, "~> 0.8.0"}
])
Using the excellent spider_man library, we want to get the paginated list of job opportunities from https://elixir-radar.com/jobs. We also use floki to grab the relevant parts from the HTML and nimble_csv to parse and rewrite our CSV results in sorted order.
SpiderMan works by spawning a process for each page, so results come in as each page finishes processing. This means that page 500 could complete before page 2, so we sort the CSV results to keep the newest entries towards the top. Since Elixir Radar doesn't include dates in its entries, we sort by page number.
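A quick way to see this unordered completion with nothing but the standard library (an illustration using Task.async_stream, not SpiderMan's actual machinery):

1..5
|> Task.async_stream(
  fn n ->
    # simulate pages taking different amounts of time to fetch
    Process.sleep(Enum.random(1..50))
    n
  end,
  ordered: false
)
|> Enum.map(fn {:ok, n} -> n end)
#=> e.g. [3, 1, 5, 2, 4]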
Settings:
- base_url = "https://elixir-radar.com/jobs" sets the base URL for all other relative paths.
- total_page = 6 hardcodes our page count to 6 to cover roughly the last 6 months.
- ETS file path: ./data/radar_jobs.ets, the Erlang Term Storage dump. We don't work with this directly.
- CSV file path: ./data/radar_jobs.csv, the original CSV file; records are saved to it as they are processed.
- Sorted CSV file path: ./data/radar_jobs-sorted.csv, the final output, ordered by page number.
The plan:
- Analyze the page numbers so we can grab everything newer than roughly 6 months ago. This works out to 6 pages today; it may need to change in the future.
- Map the job items under .job-board ul > li.
- Gather the fields title, company, location, workplace, source_description, post_url, and page_number (see the Floki sketch after this list).
- Delete the ETS and CSV files before the next run.
- Sort the CSV file by page_number.
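Before wiring up the full spider, here is a minimal Floki sketch of that field extraction against an invented fragment of the job-board markup (the class names match the selectors used below; the HTML itself, including AcmeCo, is hypothetical):

html = """
<div class="job-board">
  <ul>
    <li>
      <a class="job-post-link" href="https://example.com/jobs/1">Senior Elixir Engineer</a>
      <span class="job-board-job-location">(Remote) AcmeCo - Berlin, Germany</span>
      <span class="job-board-job-description">Build resilient systems.</span>
    </li>
  </ul>
</div>
"""

{:ok, document} = Floki.parse_document(html)

Floki.find(document, ".job-board ul > li")
|> Enum.map(fn job ->
  %{
    title: job |> Floki.find(".job-post-link") |> Floki.text() |> String.trim(),
    post_url: job |> Floki.attribute(".job-post-link", "href") |> hd(),
    location: job |> Floki.find(".job-board-job-location") |> Floki.text() |> String.trim()
  }
end)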
Build settings for spider
base_url = "https://elixir-radar.com/jobs"
requester_options = [
base_url: base_url,
middlewares: [
{SpiderMan.Middleware.UserAgent,
[
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4389.82 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4389.82 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
]},
{Tesla.Middleware.Headers,
[
{"referer", base_url},
{"accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"},
{"accept-encoding", "gzip, deflate"},
{"accept-language", "zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7"}
]},
Tesla.Middleware.DecompressResponse
]
]
settings = [
log2file: false,
downloader_options: [requester: {SpiderMan.Requester.Finch, requester_options}],
spider_options: [pipelines: []],
item_processor_options: [
storage: [
{SpiderMan.Storage.ETS, "./data/radar_jobs.ets"},
{SpiderMan.Storage.CSV,
file: "./data/radar_jobs.csv",
headers: [
:title,
:company,
:location,
:workplace,
:source_description,
:post_url,
:page_number
]}
]
]
]
Prepare callbacks for spider
import SpiderMan
import SpiderMan.Utils
require Logger
spider = SpiderList.ElixirRadar
init = fn state ->
  # Queue the first page, flagged so handle_response can discover
  # how many more pages to request from it.
  build_request(base_url)
  |> set_flag(:first_page)
  |> then(&SpiderMan.insert_request(spider, &1))

  state
end
handle_list_page = fn body, n ->
Logger.info("processing page #{n}")
{:ok, document} = Floki.parse_document(body)
jobs =
Floki.find(document, ".job-board ul")
|> hd()
|> Floki.children(include_text: false)
|> Enum.filter(&match?({"li", _, _}, &1))
items =
Enum.map(jobs, fn job ->
title = Floki.find(job, ".job-post-link") |> Floki.text() |> String.trim()
post_url = Floki.attribute(job, ".job-post-link", "href") |> hd()
full_location =
  Floki.find(job, ".job-board-job-location")
  |> Floki.text()
  |> String.trim()
  # collapse runs of whitespace (including tabs) into single spaces
  |> String.replace(~r/\s+/, " ")

description =
  Floki.find(job, ".job-board-job-description")
  |> Floki.text()
  |> String.trim()
  |> String.replace(~r/\s+/, " ")
[company_parts, location | _] = String.split(full_location, " - ", trim: true)

[workplace, company] =
  case Regex.run(~r/(\(Remote\)) (.*)/, company_parts, capture: :all_but_first) do
    [_, company] ->
      ["Remote", company]

    nil ->
      # no "(Remote)" marker: the whole segment is the company name
      ["Onsite", company_parts]
  end
build_item(
post_url,
%{
title: title,
company: company,
location: location,
workplace: workplace,
source_description: description,
post_url: post_url,
page_number: n
}
)
end)
%{items: items}
end
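For reference, Regex.run/3 with capture: :all_but_first returns only the capture groups, so a remote entry yields both the marker and the company name (AcmeCo is an invented example):

Regex.run(~r/(\(Remote\)) (.*)/, "(Remote) AcmeCo", capture: :all_but_first)
#=> ["(Remote)", "AcmeCo"]

A company segment without the "(Remote)" marker doesn't match at all, so Regex.run/3 returns nil and we fall through to the onsite branch.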
handle_response = fn
%{env: env, flag: :first_page}, _context ->
# The page count could be scraped from the page itself:
#
#   total_page =
#     Regex.run(~r/Showing page 1 of (\d+)/, env.body, capture: :all_but_first)
#     |> hd()
#     |> String.to_integer()
#
# but we hardcode 6 pages to cover roughly the last 6 months.
total_page = 6
Logger.info("total: #{total_page}")
requests =
Enum.map(2..total_page, fn n ->
build_request("/?page=#{n}")
|> set_flag({:list_page, n})
end)
handle_list_page.(env.body, 1)
|> Map.put(:requests, requests)
%{env: env, flag: {:list_page, n}}, _context ->
handle_list_page.(env.body, n)
end
callbacks = [init: init, handle_response: handle_response]
{:ok, settings} = SpiderMan.CommonSpider.check_callbacks_and_merge_settings(callbacks, settings)
Run the spider
# Delete previous dumps
File.rm_rf("./data/radar_jobs.csv")
File.rm_rf("./data/radar_jobs.ets")
SpiderMan.run_until_zero(spider, settings, 5_000)
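As the name suggests, run_until_zero/3 starts the spider with our merged settings and keeps checking, on the 5-second interval given here, until no requests or items remain in flight before shutting the spider down.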
Sort the CSV by page number, ascending
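One subtlety before we sort: CSV values parse back as strings, and sorting page numbers as strings orders "10" before "2":

Enum.sort(["1", "10", "2"])
#=> ["1", "10", "2"]

With only 6 pages that can't bite yet, but we convert the page number to an integer before sorting anyway.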
alias NimbleCSV.RFC4180, as: CSV
headers = [
:title,
:company,
:location,
:workplace,
:source_description,
:post_url,
:page_number
]
sorted_path = "./data/radar_jobs-sorted.csv"
File.rm_rf(sorted_path)
io_device = File.open!(sorted_path, [:write, :append, :binary, :utf8])
header = CSV.dump_to_iodata([headers])
csv =
"./data/radar_jobs.csv"
|> File.read!()
|> CSV.parse_string()
|> Enum.sort_by(fn row -> row |> List.last() |> String.to_integer() end, :asc)
|> CSV.dump_to_iodata()
:ok = IO.write(io_device, header)
:ok = IO.write(io_device, csv)
:ok = File.close(io_device)
data =
  sorted_path
  |> File.read!()
  |> CSV.parse_string()
data
|> Enum.with_index()
|> Enum.map(fn {row, index} ->
%{
id: index,
title: Enum.at(row, 0),
company: Enum.at(row, 1),
location: Enum.at(row, 2),
workplace: Enum.at(row, 3),
source_description: Enum.at(row, 4),
post_url: Enum.at(row, 5),
page_number: Enum.at(row, 6)
}
end)
|> Kino.DataTable.new()