larajobs

Mix.install([
  {:kino, "~> 0.8.0"},
  # {:kino_lab, "~> 0.1.0-dev", github: "jonatanklosko/kino_lab"},
  # {:kino_vega_lite, "~> 0.1.1"},
  # {:kino_db, "~> 0.1.1"},
  {:req, "~> 0.3.4"},
  {:floki, "~> 0.32"},
  {:nimble_csv, "~> 1.1"},
  {:timex, "~> 3.0"}
])

Summary

Using the excellent req library, we want to get the list of jobs from the jobs site. The bulk of our notebooks use spider man but that is generally better suited to crawling paginated results. A number of sites stuff everything into the immediate response and use javascript to show/hide the results by "page".

Specifications

Map job items
1. New jobs #app .container div (no classes applied)
2. Older jobs #app .container div (no classed applied)
Gather fields
1. date
2. title
3. company
4. location
5. tags
6. source_url
7. post_url
Store results as CSV
Sort CSV file by date
Convert date from relative textual representation to a rough date.
1. We can't be too precise unfortunately so we'll just use the first of the month, etc.

Code

Get Items from Parsing HTML

parse_date = fn date ->
  # xd
  # xw
  # xmo
  day_groups = Regex.run(~r/(\d+?)d/, date, capture: :all_but_first)
  week_groups = Regex.run(~r/(\d+?)w/, date, capture: :all_but_first)
  month_groups = Regex.run(~r/(\d+?)mo/, date, capture: :all_but_first)

  days =
    case day_groups do
      [number] -> String.to_integer(number)
      nil -> 0
    end

  weeks =
    case week_groups do
      [number] -> String.to_integer(number)
      nil -> 0
    end

  months =
    case month_groups do
      [number] -> String.to_integer(number)
      nil -> 0
    end

  # IO.inspect(days, label: "days")
  # IO.inspect(weeks, label: "weeks")
  # IO.inspect(months, label: "months")
  Timex.shift(Timex.today(), days: -days)
  |> Timex.shift(weeks: -weeks)
  |> Timex.shift(months: -months)
end

base_url = "https://larajobs.com/"

req_html = Req.new(http_errors: :raise)

html = Req.get!(req_html, url: base_url).body

{:ok, document} = Floki.parse_document(html)

jobs = Floki.find(document, "a.job-link")

items =
  Enum.map(jobs, fn job ->
    # source_url = Floki.attribute(job, ".job-link", "href")
    # post_url = Floki.attribute(job, ".job-link", "data-url")
    source_url = Floki.attribute(job, ".job-link", "href") |> hd()
    post_url = Floki.attribute(job, ".job-link", "data-url") |> hd()

    title = Floki.find(job, "p:nth-of-type(2)") |> Floki.text() |> String.trim()
    company = Floki.find(job, "p:nth-of-type(1)") |> Floki.text() |> String.trim()

    location =
      Floki.find(
        job,
        ".job-link > div > div:last-child > div:last-child > div:nth-child(1) > div:nth-child(1)"
      )
      |> Floki.text()
      |> String.trim()

    tags =
      Floki.find(
        job,
        ".job-link > div > div:last-child > div:last-child > div:last-child .text-sm"
      )
      |> Enum.map(&(&1 |> Floki.text() |> String.trim()))
      |> Enum.join(" | ")

    date =
      Floki.find(
        job,
        ".job-link > div > div:last-child > div:last-child > div:nth-child(1) > div:last-child"
      )
      |> Floki.text()
      |> String.trim()

    full_date = parse_date.(date)
    # IO.inspect(title, label: "title")
    # IO.inspect(company, label: "company")
    # IO.inspect(location, label: "location")
    # IO.inspect(tags, label: "tags")
    # IO.inspect(date, label: "date")
    # IO.inspect(source_url, label: "source_url")
    # IO.inspect(post_url, label: "post_url")
    # IO.inspect(job, label: "job")

    %{
      posted_at: full_date,
      title: title,
      company: company,
      location: location,
      tags: tags,
      source_url: base_url <> String.slice(source_url, 1..-1),
      post_url: post_url
    }
  end)

# IO.inspect(jobs_elements, label: "jobs_elements")

items

Write Items to CSV

alias NimbleCSV.RFC4180, as: CSV

take = fn item, header_keys ->
  Enum.map(header_keys, &Map.get(item, &1))
end

header_keys = [
  :posted_at,
  :title,
  :company,
  :location,
  :tags,
  :source_url,
  :post_url
]

sorted_path = "./data/larajobs-sorted.csv"
File.rm_rf(sorted_path)
io_device = File.open!(sorted_path, [:write, :append, :binary, :utf8])

header = CSV.dump_to_iodata([header_keys])

# See https://www.coletiv.com/blog/elixir-sort-lists-by-date/
# https://elixir-lang.org/blog/2020/01/27/elixir-v1-10-0-released/#improvements-to-sort-based-apis-in-enum
csv =
  items
  |> Enum.sort(&(Date.compare(&1.posted_at, &2.posted_at) != :lt))
  |> Enum.map(&take.(&1, header_keys))
  |> CSV.dump_to_iodata()

:ok = IO.write(io_device, header)
:ok = IO.write(io_device, csv)
:ok = File.close(io_device)

Display the Sorted CSV

data =
  "./data/larajobs-sorted.csv"
  |> File.read!()
  |> CSV.parse_string()

data
|> Enum.with_index()
|> Enum.map(fn {row, index} ->
  %{
    id: index,
    posted_at: Enum.at(row, 0),
    title: Enum.at(row, 1),
    company: Enum.at(row, 2),
    location: Enum.at(row, 3),
    tags: Enum.at(row, 4),
    source_url: Enum.at(row, 5),
    post_url: Enum.at(row, 6)
  }
end)
|> Kino.DataTable.new()

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

req--larajobs.livemd

req--larajobs.livemd

larajobs

Summary

Specifications

Code

Get Items from Parsing HTML

Write Items to CSV

Display the Sorted CSV

Files

req--larajobs.livemd

Latest commit

History

req--larajobs.livemd

File metadata and controls

larajobs

Summary

Specifications

Code

Get Items from Parsing HTML

Write Items to CSV

Display the Sorted CSV