Mix.install([
{:spider_man, "~> 0.3"},
{:floki, "~> 0.31"},
{:nimble_csv, "~> 1.1"},
{:kino, "~> 0.8.0"}
])
Using the excellent spider_man library, we want to get the paginated list of job opportunities from https://elixir-radar.com/jobs. We also use floki to grab the relevant parts from the HTML and nimble_csv to parse and rewrite our CSV results in sorted order.
SpiderMan works by spawning a process for each page, so results come in as each page finishes processing. This means that page 500 could complete before page 2, so we sort the CSV results to keep the newest entries towards the top. Since Elixir Radar doesn't include dates in its entries, we sort by page number.
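A quick way to see this unordered completion with nothing but the standard library (an illustration using Task.async_stream, not SpiderMan's actual machinery):

1..5
|> Task.async_stream(
  fn n ->
    # simulate pages taking different amounts of time to fetch
    Process.sleep(Enum.random(1..50))
    n
  end,
  ordered: false
)
|> Enum.map(fn {:ok, n} -> n end)
#=> e.g. [3, 1, 5, 2, 4]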
Settings:
- base_url = "https://elixir-radar.com/jobs" sets the base URL for all other relative paths.
- total_page = 6 hardcodes our page count to 6 to cover roughly the last 6 months.
- ETS file path: ./data/radar_jobs.ets, the Erlang Term Storage dump. We don't work with this directly.
- CSV file path: ./data/radar_jobs.csv, the original CSV file; records are saved to it as they are processed.
- Sorted CSV file path: ./data/radar_jobs-sorted.csv, the final output, ordered by page number.
The plan:
- Analyze the page numbers so we can grab everything newer than roughly 6 months ago. This works out to 6 pages today; it may need to change in the future.
- Map the job items under .job-board ul > li.
- Gather the fields title, company, location, workplace, source_description, post_url, and page_number (see the Floki sketch after this list).
- Delete the ETS and CSV files before the next run.
- Sort the CSV file by page_number.
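Before wiring up the full spider, here is a minimal Floki sketch of that field extraction against an invented fragment of the job-board markup (the class names match the selectors used below; the HTML itself, including AcmeCo, is hypothetical):

html = """
<div class="job-board">
  <ul>
    <li>
      <a class="job-post-link" href="https://example.com/jobs/1">Senior Elixir Engineer</a>
      <span class="job-board-job-location">(Remote) AcmeCo - Berlin, Germany</span>
      <span class="job-board-job-description">Build resilient systems.</span>
    </li>
  </ul>
</div>
"""

{:ok, document} = Floki.parse_document(html)

Floki.find(document, ".job-board ul > li")
|> Enum.map(fn job ->
  %{
    title: job |> Floki.find(".job-post-link") |> Floki.text() |> String.trim(),
    post_url: job |> Floki.attribute(".job-post-link", "href") |> hd(),
    location: job |> Floki.find(".job-board-job-location") |> Floki.text() |> String.trim()
  }
end)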
Build settings for spider
base_url = "https://elixir-radar.com/jobs"
requester_options = [
base_url: base_url,
middlewares: [
{SpiderMan.Middleware.UserAgent,
[
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4389.82 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4389.82 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
]},
{Tesla.Middleware.Headers,
[
{"referer", base_url},
{"accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"},
{"accept-encoding", "gzip, deflate"},
{"accept-language", "zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7"}
]},
Tesla.Middleware.DecompressResponse
]
]
settings = [
log2file: false,
downloader_options: [requester: {SpiderMan.Requester.Finch, requester_options}],
spider_options: [pipelines: []],
item_processor_options: [
storage: [
{SpiderMan.Storage.ETS, "./data/radar_jobs.ets"},
{SpiderMan.Storage.CSV,
file: "./data/radar_jobs.csv",
headers: [
:title,
:company,
:location,
:workplace,
:source_description,
:post_url,
:page_number
]}
]
]
]
Prepare callbacks for spider
import SpiderMan
import SpiderMan.Utils
require Logger
spider = SpiderList.ElixirRadar
init = fn state ->
  # Queue the first page, flagged so handle_response can discover
  # how many more pages to request from it.
  build_request(base_url)
  |> set_flag(:first_page)
  |> then(&SpiderMan.insert_request(spider, &1))

  state
end
handle_list_page = fn body, n ->
Logger.info("processing page #{n}")
{:ok, document} = Floki.parse_document(body)
jobs =
Floki.find(document, ".job-board ul")
|> hd()
|> Floki.children(include_text: false)
|> Enum.filter(&match?({"li", _, _}, &1))
items =
Enum.map(jobs, fn job ->
title = Floki.find(job, ".job-post-link") |> Floki.text() |> String.trim()
post_url = Floki.attribute(job, ".job-post-link", "href") |> hd()
full_location =
  Floki.find(job, ".job-board-job-location")
  |> Floki.text()
  |> String.trim()
  # collapse runs of whitespace (including tabs) into single spaces
  |> String.replace(~r/\s+/, " ")

description =
  Floki.find(job, ".job-board-job-description")
  |> Floki.text()
  |> String.trim()
  |> String.replace(~r/\s+/, " ")
[company_parts, location | _] = String.split(full_location, " - ", trim: true)

[workplace, company] =
  case Regex.run(~r/(\(Remote\)) (.*)/, company_parts, capture: :all_but_first) do
    [_, company] ->
      ["Remote", company]

    nil ->
      # no "(Remote)" marker: the whole segment is the company name
      ["Onsite", company_parts]
  end
build_item(
post_url,
%{
title: title,
company: company,
location: location,
workplace: workplace,
source_description: description,
post_url: post_url,
page_number: n
}
)
end)
%{items: items}
end
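For reference, Regex.run/3 with capture: :all_but_first returns only the capture groups, so a remote entry yields both the marker and the company name (AcmeCo is an invented example):

Regex.run(~r/(\(Remote\)) (.*)/, "(Remote) AcmeCo", capture: :all_but_first)
#=> ["(Remote)", "AcmeCo"]

A company segment without the "(Remote)" marker doesn't match at all, so Regex.run/3 returns nil and we fall through to the onsite branch.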
handle_response = fn
%{env: env, flag: :first_page}, _context ->
# The page count could be scraped from the page itself:
#
#   total_page =
#     Regex.run(~r/Showing page 1 of (\d+)/, env.body, capture: :all_but_first)
#     |> hd()
#     |> String.to_integer()
#
# but we hardcode 6 pages to cover roughly the last 6 months.
total_page = 6
Logger.info("total: #{total_page}")
requests =
Enum.map(2..total_page, fn n ->
build_request("/?page=#{n}")
|> set_flag({:list_page, n})
end)
handle_list_page.(env.body, 1)
|> Map.put(:requests, requests)
%{env: env, flag: {:list_page, n}}, _context ->
handle_list_page.(env.body, n)
end
callbacks = [init: init, handle_response: handle_response]
{:ok, settings} = SpiderMan.CommonSpider.check_callbacks_and_merge_settings(callbacks, settings)
Run the spider
# Delete previous dumps
File.rm_rf("./data/radar_jobs.csv")
File.rm_rf("./data/radar_jobs.ets")
SpiderMan.run_until_zero(spider, settings, 5_000)
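As the name suggests, run_until_zero/3 starts the spider with our merged settings and keeps checking, on the 5-second interval given here, until no requests or items remain in flight before shutting the spider down.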
Sort the CSV by page number, ascending
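One subtlety before we sort: CSV values parse back as strings, and sorting page numbers as strings orders "10" before "2":

Enum.sort(["1", "10", "2"])
#=> ["1", "10", "2"]

With only 6 pages that can't bite yet, but we convert the page number to an integer before sorting anyway.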
alias NimbleCSV.RFC4180, as: CSV
headers = [
:title,
:company,
:location,
:workplace,
:source_description,
:post_url,
:page_number
]
sorted_path = "./data/radar_jobs-sorted.csv"
File.rm_rf(sorted_path)
io_device = File.open!(sorted_path, [:write, :append, :binary, :utf8])
header = CSV.dump_to_iodata([headers])
csv =
"./data/radar_jobs.csv"
|> File.read!()
|> CSV.parse_string()
|> Enum.sort_by(fn row -> row |> List.last() |> String.to_integer() end, :asc)
|> CSV.dump_to_iodata()
:ok = IO.write(io_device, header)
:ok = IO.write(io_device, csv)
:ok = File.close(io_device)
data =
  sorted_path
  |> File.read!()
  |> CSV.parse_string()
data
|> Enum.with_index()
|> Enum.map(fn {row, index} ->
%{
id: index,
title: Enum.at(row, 0),
company: Enum.at(row, 1),
location: Enum.at(row, 2),
workplace: Enum.at(row, 3),
source_description: Enum.at(row, 4),
post_url: Enum.at(row, 5),
page_number: Enum.at(row, 6)
}
end)
|> Kino.DataTable.new()