ElixirRadar Jobs

Mix.install([
  {:spider_man, "~> 0.3"},
  {:floki, "~> 0.31"},
  {:nimble_csv, "~> 1.1"},
  {:kino, "~> 0.8.0"}
])

Summary

Using the excellent spider_man library, we fetch the paginated list of job postings from https://elixir-radar.com/jobs. We use floki to extract the relevant parts of the HTML and nimble_csv to sort the resulting CSV.

SpiderMan processes pages concurrently, so results come in as each page finishes rather than in page order; page 500 could complete before page 2. We therefore sort the CSV results to keep the newest entries toward the top.

Since Elixir Radar doesn't include dates in its entries, we sort by page number.
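
As a rough illustration (with made-up job titles), sorting the collected items by page number restores page order even though the pages finish processing out of order:

# Illustrative only: three items that arrived out of page order
items = [
  %{title: "Senior Elixir Engineer", page_number: 3},
  %{title: "Backend Developer", page_number: 1},
  %{title: "Platform Engineer", page_number: 2}
]

# Sorting by page_number ascending puts the newest entries (page 1) back on top
Enum.sort_by(items, & &1.page_number, :asc)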

Noted Variables and Defaults

  1. base_url = https://elixir-radar.com/jobs
    1. Sets the base URL for all other relative paths.
  2. total_page = 6
    1. We hardcode the page count to 6 to cover roughly the last 6 months.
  3. ETS file path = ./data/radar_jobs.ets
    1. Erlang Term Storage dump; we don't work with this file directly.
  4. CSV file path = ./data/radar_jobs.csv
    1. Original CSV file; records are appended as they are processed.
  5. CSV sorted file path = ./data/radar_jobs-sorted.csv
    1. Sorted copy of the CSV, written after the crawl completes (see the sketch after this list).
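
For reference, a defaults map like the sketch below collects these values in one place; the key names here (ets_path and friends) are just illustrative, and the notebook itself defines the values inline in the cells that follow.

# Illustrative only: the defaults listed above, gathered into a single map
defaults = %{
  base_url: "https://elixir-radar.com/jobs",
  total_page: 6,
  ets_path: "./data/radar_jobs.ets",
  csv_path: "./data/radar_jobs.csv",
  sorted_csv_path: "./data/radar_jobs-sorted.csv"
}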

Specifications

  1. Hardcode the page range to cover roughly the last 6 months of postings.
    1. This is currently 6 pages; it may need to change in the future.
  2. Map job items
    1. .job-board ul > li
  3. Gather the following fields (sketched after this list)
    1. title
    2. company
    3. location
    4. workplace
    5. source_description
    6. post_url
    7. page_number
  4. Delete the ETS and CSV files before the next run
  5. Sort the CSV file by page_number
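
Before wiring these selectors into the spider, here is a small self-contained Floki sketch of steps 2 and 3 against a made-up HTML fragment (the real markup on elixir-radar.com may differ slightly); the full parsing callback lives under Configure Parsing below.

# Illustrative only: the selectors from the specification applied to a tiny fake page
html = """
<div class="job-board">
  <ul>
    <li>
      <a class="job-post-link" href="https://example.com/job/1">Elixir Engineer</a>
      <div class="job-board-job-location">Acme Corp - Berlin</div>
      <div class="job-board-job-description">Build things with Elixir.</div>
    </li>
  </ul>
</div>
"""

{:ok, document} = Floki.parse_document(html)

document
|> Floki.find(".job-board ul > li")
|> Enum.map(fn job ->
  %{
    title: job |> Floki.find(".job-post-link") |> Floki.text() |> String.trim(),
    post_url: job |> Floki.attribute(".job-post-link", "href") |> List.first(),
    location: job |> Floki.find(".job-board-job-location") |> Floki.text() |> String.trim()
  }
end)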

Configure Settings

Build the settings for the spider: the HTTP requester middleware (user agents, headers, decompression) and the item storage (ETS and CSV).

base_url = "https://elixir-radar.com/jobs"

requester_options = [
  base_url: base_url,
  middlewares: [
    {SpiderMan.Middleware.UserAgent,
     [
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4389.82 Safari/537.36",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4389.82 Safari/537.36",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
     ]},
    {Tesla.Middleware.Headers,
     [
       {"referer", base_url},
       {"accept",
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"},
       {"accept-encoding", "gzip, deflate"},
       {"accept-language", "zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7"}
     ]},
    Tesla.Middleware.DecompressResponse
  ]
]

settings = [
  log2file: false,
  downloader_options: [requester: {SpiderMan.Requester.Finch, requester_options}],
  spider_options: [pipelines: []],
  item_processor_options: [
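    # Persist each scraped item to both an ETS dump and a CSV file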
    storage: [
      {SpiderMan.Storage.ETS, "./data/radar_jobs.ets"},
      {SpiderMan.Storage.CSV,
       file: "./data/radar_jobs.csv",
       headers: [
         :title,
         :company,
         :location,
         :workplace,
         :source_description,
         :post_url,
         :page_number
       ]}
    ]
  ]
]

Configure Parsing

Prepare the callbacks for the spider: init seeds the first page request and handle_response turns each response into items plus follow-up requests for the remaining pages.

import SpiderMan
import SpiderMan.Utils
require Logger

spider = SpiderList.ElixirRadar

init = fn state ->
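  # Seed the spider with a request for the first page, flagged so handle_response can detect it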
  build_request(base_url)
  |> set_flag(:first_page)
  |> then(&SpiderMan.insert_request(spider, &1))

  state
end

handle_list_page = fn body, n ->
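  # Parse one job-board page (page number n) into a list of items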
  Logger.info("processing page #{n}")
  {:ok, document} = Floki.parse_document(body)

  jobs =
    Floki.find(document, ".job-board ul")
    |> hd()
    |> Floki.children(include_text: false)
    |> Enum.filter(&match?({"li", _, _}, &1))

  items =
    Enum.map(jobs, fn job ->
      title = Floki.find(job, ".job-post-link") |> Floki.text() |> String.trim()
      post_url = Floki.attribute(job, ".job-post-link", "href") |> hd()

      full_location =
        Floki.find(job, ".job-board-job-location")
        |> Floki.text()
        |> String.trim()
        |> String.replace(~r/\s+/, " ")
        |> String.replace(~r/\t/, " ")

      description =
        Floki.find(job, ".job-board-job-description")
        |> Floki.text()
        |> String.trim()
        |> String.replace(~r/\s+/, " ")
        |> String.replace(~r/\t/, " ")

      [company_parts, location | _] = full_location |> String.split(" - ", trim: true)

      [workplace, company] =
        case Regex.run(~r/(\(Remote\)) (.*)/, company_parts, capture: :all_but_first) do
          [_, company] ->
            ["Remote", company]

          nil ->
            # No "(Remote)" marker matched, so the whole string is the company name
            ["Onsite", company_parts]
        end

      build_item(
        post_url,
        %{
          title: title,
          company: company,
          location: location,
          workplace: workplace,
          source_description: description,
          post_url: post_url,
          page_number: n
        }
      )
    end)

  %{items: items}
end

handle_response = fn
  %{env: env, flag: :first_page}, _context ->
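    # First page: decide how many more pages to request, then parse this page as page 1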
    # total_page =
    #   Regex.run(~r/Showing page 1 of (\d+)/, env.body, capture: :all_but_first)
    #   |> hd()
    #   |> String.to_integer()
    total_page = 6

    Logger.info("total: #{total_page}")

    requests =
      Enum.map(2..total_page, fn n ->
        build_request("/?page=#{n}")
        |> set_flag({:list_page, n})
      end)

    handle_list_page.(env.body, 1)
    |> Map.put(:requests, requests)

  %{env: env, flag: {:list_page, n}}, _context ->
    handle_list_page.(env.body, n)
end

callbacks = [init: init, handle_response: handle_response]
{:ok, settings} = SpiderMan.CommonSpider.check_callbacks_and_merge_settings(callbacks, settings)

Executing

Delete the previous output files, then run the spider until it has no remaining work.

# Delete previous dumps
File.rm_rf("./data/radar_jobs.csv")
File.rm_rf("./data/radar_jobs.ets")
SpiderMan.run_until_zero(spider, settings, 5_000)

Sorting the Results

Sort the CSV by page number, ascending. The page number comes back from the CSV as a string, so we convert it to an integer before sorting.

alias NimbleCSV.RFC4180, as: CSV

headers = [
  :title,
  :company,
  :location,
  :workplace,
  :source_description,
  :post_url,
  :page_number
]

sorted_path = "./data/radar_jobs-sorted.csv"
File.rm_rf(sorted_path)
io_device = File.open!(sorted_path, [:write, :append, :binary, :utf8])

header = CSV.dump_to_iodata([headers])

csv =
  "./data/radar_jobs.csv"
  |> File.read!()
  |> CSV.parse_string()
  # page_number is the last column; parse it so pages sort numerically (2 before 10)
  |> Enum.sort_by(&(&1 |> List.last() |> String.to_integer()), :asc)
  |> CSV.dump_to_iodata()

:ok = IO.write(io_device, header)
:ok = IO.write(io_device, csv)
:ok = File.close(io_device)

Display the Sorted CSV

data =
  "./data/radar_jobs-sorted.csv"
  |> File.read!()
  |> CSV.parse_string()

data
|> Enum.with_index()
|> Enum.map(fn {row, index} ->
  %{
    id: index,
    title: Enum.at(row, 0),
    company: Enum.at(row, 1),
    location: Enum.at(row, 2),
    workplace: Enum.at(row, 3),
    source_description: Enum.at(row, 4),
    post_url: Enum.at(row, 5),
    page_number: Enum.at(row, 6)
  }
end)
|> Kino.DataTable.new()