Merge pull request #7 from kingdonb/action
Add GitHub Action workflows
Kingdon Barrett authored Aug 15, 2023
2 parents 2801670 + 469b770 commit e45e3ea
Showing 15 changed files with 331 additions and 80 deletions.
10 changes: 10 additions & 0 deletions .github/scripts/check_summary.sh
@@ -0,0 +1,10 @@
#!/bin/bash
set -e

LINE_COUNT=$(wc -l < pr-summary.csv)

if [ "$LINE_COUNT" -gt 1 ]; then
echo "Issues found in PR. Attaching pr-summary.csv for review..."
else
echo "No direct issues found in PR. Attaching baseline-unresolved.csv for reference..."
fi
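
Note: the line-count check treats a header-only CSV as "no issues found". A hypothetical pr-summary.csv with one finding (the column names here are illustrative, not taken from this commit) would have two lines and take the first branch:

  source_file,target,response_status
  https://fluxcd.io/docs/,https://fluxcd.io/docs/missing-page/,404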
9 changes: 9 additions & 0 deletions .github/scripts/comment_on_pr.sh
@@ -0,0 +1,9 @@
#!/bin/bash
set -e

LINE_COUNT=$(wc -l < pr-summary.csv)

if [ "$LINE_COUNT" -le 1 ]; then
# Using GitHub CLI to comment on the PR.
gh pr comment ${{ github.event.pull_request.number }} --body "Warning: Some unresolved baseline issues are present. Please check the attached baseline-unresolved.csv."
fi
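
A quick local dry run of the script might look like this (assuming an authenticated gh CLI and a pr-summary.csv in the working directory; the PR number is just an example):

  PR_NUMBER=1573 ./.github/scripts/comment_on_pr.sh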
29 changes: 29 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,29 @@
name: Test Action Workflow

on:
  workflow_dispatch:
    inputs:
      prNumber:
        description: 'PR number to test against'
        required: true
        default: '1573'

jobs:
  test-action:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Ruby 3.0
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: 3.0
          bundler-cache: true

      - name: Run the action
        uses: ./action/
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          prNumber: ${{ github.event.inputs.prNumber }}
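
Since the workflow only triggers on workflow_dispatch, it can be kicked off from the CLI as well as the Actions UI, for example:

  gh workflow run "Test Action Workflow" -f prNumber=1573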
24 changes: 21 additions & 3 deletions Makefile
@@ -1,5 +1,5 @@
-.PHONY: main clean-cache preview all clean
-all: main clean-cache preview
+.PHONY: main clean-cache preview all clean normalize summary
+all: main clean-cache preview normalize summary
 
 main:
 	ruby ./main.rb
@@ -12,6 +12,24 @@ clean-cache:
 preview:
 	ruby ./main.rb fluxcd.io deploy-preview-1573--fluxcd.netlify.app preview-report.csv false
 
+run_with_preview:
+	@echo "Running with preview URL: $(PREVIEW_URL)"
+	ruby ./main.rb fluxcd.io $(PREVIEW_URL) preview-report.csv false
+
 clean: clean-cache
-	@rm -f report.csv preview-report.csv
+	@rm -f report.csv preview-report.csv pr-summary.csv baseline-unresolved.csv
 	@echo "Clean complete!"
+
+normalize:
+	@# Normalize the main report.csv: drop the header row, rewrite the first
+	@# two fluxcd.io occurrences per line to the preview domain, then sort.
+	@gsed -i '1d' report.csv
+	@PREVIEW_DOMAIN="$(if $(PREVIEW_URL),$(PREVIEW_URL),deploy-preview-1573--fluxcd.netlify.app)"; \
+	gsed -i "s/fluxcd.io/$$PREVIEW_DOMAIN/1; s/fluxcd.io/$$PREVIEW_DOMAIN/1" report.csv
+	@sort -o report.csv report.csv
+
+	@# Normalize the preview-report.csv: drop the header row and sort.
+	@gsed -i '1d' preview-report.csv
+	@sort -o preview-report.csv preview-report.csv
+
+summary:
+	ruby ./lib/summary.rb
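
A typical local run mirrors the CI steps (the preview domain shown is the Makefile's own default, passed explicitly for illustration):

  make main clean-cache
  make run_with_preview normalize summary PREVIEW_URL=deploy-preview-1573--fluxcd.netlify.app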
57 changes: 57 additions & 0 deletions action/action.yml
@@ -0,0 +1,57 @@
name: Link Checker

on: [pull_request]

jobs:
  check-links:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Ruby 3.0
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: 3.0
          bundler-cache: true

      - name: Run main target
        run: make main

      - name: Clean cache
        run: make clean-cache

      - name: Run with preview
        run: make run_with_preview

      - name: Normalize reports
        run: make normalize

      - name: Run summary
        id: run-summary
        run: make summary
        continue-on-error: true

      - name: Check summary results
        run: ./.github/scripts/check_summary.sh
        if: steps.run-summary.outcome == 'failure'

      - name: Comment on PR if necessary
        run: ./.github/scripts/comment_on_pr.sh
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Upload pr-summary.csv
        uses: actions/upload-artifact@v3
        if: always() && steps.run-summary.outcome == 'failure'
        with:
          name: pr-summary
          path: pr-summary.csv

      - name: Upload baseline-unresolved.csv
        uses: actions/upload-artifact@v3
        if: always() && steps.run-summary.outcome == 'success'
        with:
          name: baseline-unresolved
          path: baseline-unresolved.csv
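
After a run, either artifact can be fetched for inspection with the GitHub CLI (the run ID below is a placeholder):

  gh run download 1234567890 -n pr-summary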
18 changes: 14 additions & 4 deletions lib/cache_helper.rb
@@ -1,13 +1,23 @@
 module CacheHelper
   def self.get_cache_path(url)
     uri = URI(url)
-    cache_path = "cache" + uri.path
 
+    cache_path = File.join("cache", uri.path)
     # If the path doesn't have a common file extension, treat it as a directory.
     unless cache_path.match(/\.(html|xml|json|txt|js|css|jpg|jpeg|png|gif)$/i)
-      cache_path += "/index.html"
+      cache_path = File.join(cache_path, "index.html")
     end
 
     cache_path
   end
 
+  def self.write_to_cache(url, content, status)
+    cache_path = get_cache_path(url)
+    data = { content: content, status: status }
+    File.write(cache_path, JSON.dump(data))
+  end
+
+  def self.read_from_cache(url)
+    cache_path = get_cache_path(url)
+    data = JSON.parse(File.read(cache_path))
+    [data["content"], data["status"]]
+  end
 end
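
A round-trip through the cache then looks roughly like this (a sketch only: it assumes json and uri are already required by the entrypoint, and that the directory for the cache path exists — link.rb creates it with FileUtils.mkdir_p before writing):

  CacheHelper.write_to_cache("https://fluxcd.io/docs/", "<html>...</html>", "200")
  content, status = CacheHelper.read_from_cache("https://fluxcd.io/docs/")
  status  # => "200"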
58 changes: 34 additions & 24 deletions lib/link.rb
@@ -1,3 +1,6 @@
+require './lib/cache_helper'
+require './lib/url_helper'
+
 class Link
   attr_accessor :source_file, :target, :type, :anchor,
                 :response_status, :link_string, :link_text, :line_no, :reference_intact
@@ -8,15 +11,24 @@ def initialize(source_url, link_element, domain)

if link_element
@link_string = link_element['href']
@link_text = link_element.text.strip
link_text = link_element.text
@line_no = link_element.line
determine_type
extract_anchor
else
# If no link_element is provided, assume the source is the target and type is local.
@link_string = source_url
@target = source_url
@type = 'local'
end

make_absolute
end

def link_text=(value)
@link_text = value.strip.gsub(/\s+/, ' ')
end

   def to_h
     {
       source_file: @source_file,
@@ -46,13 +58,18 @@ def self.from_h(hash)
   end
 
   def download_and_store
-    cache_path = get_cache_path
-    unless File.exist?(cache_path)
-      html_content = Net::HTTP.get(URI(@source_file))
-      FileUtils.mkdir_p(File.dirname(cache_path))
-      File.write(cache_path, html_content)
+    cache_path = CacheHelper.get_cache_path(@source_file)
+    if File.exist?(cache_path)
+      html_content, status = CacheHelper.read_from_cache(@source_file)
+      @response_status = status if status && status.to_i >= 400
     else
-      html_content = File.read(cache_path)
+      response = Net::HTTP.get_response(URI(@source_file))
+      html_content = response.body
+      html_content.force_encoding('UTF-8')
+      # Ensure the directory exists before writing the cache
+      FileUtils.mkdir_p(File.dirname(cache_path))
+      CacheHelper.write_to_cache(@source_file, html_content, response.code)
+      @response_status = response.code if response.code.to_i >= 400
     end
 
     Nokogiri::HTML(html_content)
@@ -66,6 +83,14 @@ def reference_intact?
     @reference_intact
   end
 
+  def set_error(error_message)
+    @error = error_message
+  end
+
+  def has_error?
+    !@error.nil?
+  end
+
   private
 
   def determine_type
@@ -77,27 +102,12 @@ def determine_type
   end
 
   def extract_anchor
-    @anchor = URI(@link_string).fragment
-  rescue URI::InvalidURIError
-    @anchor = URI(URI::Parser.new.escape(@link_string)).fragment
+    @anchor = URLHelper.extract_fragment(@link_string)
   end
 
   def make_absolute
     return unless @link_string
-    @target = URI.join(@source_file, @link_string).to_s
-  rescue URI::InvalidURIError
-    @target = URI.join(@source_file, URI::Parser.new.escape(@link_string)).to_s
-    nil
-  end
-
-  def get_cache_path
-    uri = URI(@source_file)
-    cache_path = "cache" + uri.path
-    # If the path doesn't have a common file extension, treat it as a directory.
-    unless cache_path.match(/\.(html|xml|json|txt|js|css|jpg|jpeg|png|gif)$/i)
-      cache_path += "/index.html"
-    end
-    cache_path
+    @target = URLHelper.make_absolute(@source_file, @link_string)
   end
 end
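
As a usage sketch (Nokogiri assumed loaded, and URLHelper provided by the new lib/url_helper required above — assumed here to behave like URI#fragment and URI.join, which this refactor implies):

  doc = Nokogiri::HTML('<a href="/docs/#install">Install  Guide</a>')
  link = Link.new("https://fluxcd.io/", doc.at_css('a'), "fluxcd.io")
  link.link_text  # => "Install Guide" (whitespace collapsed by the custom writer)
  link.anchor     # => "install"
  link.target     # => "https://fluxcd.io/docs/"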

40 changes: 28 additions & 12 deletions lib/link/link_analyzer.rb
@@ -9,41 +9,48 @@ def initialize(domain, masquerade_domain)
   end
 
   def analyze_links(sitemap_urls)
-    links_data = []
+    links_data = {}
     threads = []
 
     sitemap_urls.each_slice(SLICE_SIZE) do |slice|
       threads << Thread.new do
         slice.each do |url|
+          link = ensure_link(links_data, url, nil)
           begin
             url = masquerade_url(url) if @masquerade_domain
-            puts "Visiting: #{url}"
-            doc = Link.new(url, nil, @domain).download_and_store
+            base_url, fragment = url.split('#', 2)
+            fragment = URI::Parser.new.escape(fragment) if fragment
+            full_url = fragment ? "#{base_url}##{fragment}" : base_url
+
+            puts "Visiting: #{full_url}"
+            doc = link.download_and_store
 
             # Extracting all the links from the page
             doc.css('a').each do |link_element|
               link_href = link_element['href']
               # Skip links without href or with href set to '#'
               next if link_href.nil? || link_href.strip == '#'
 
-              begin
-                link = Link.new(url, link_element, @domain)
-              rescue URI::InvalidURIError => e
-                PRY_MUTEX.synchronize{binding.pry}
-              end
-              LINKS_MUTEX.synchronize do
-                links_data << link
-              end
+              # Splitting the base URL and fragment for proper handling
+              base_url, fragment = link_href.split('#', 2)
+              fragment = URI::Parser.new.escape(fragment) if fragment
+
+              # Combine the base URL with the original URL and append the fragment if present
+              joined_url = URI.join(url, base_url).to_s
+              target_url = fragment ? "#{joined_url}##{fragment}" : joined_url
+
+              link = ensure_link(links_data, target_url, link_element)
             end
           rescue StandardError => e
+            link.response_status = "Error: #{e.message}"
             puts "Error downloading or analyzing URL #{url}: #{e.message}"
           end
         end
       end
     end
 
     threads.each(&:join)
-    links_data
+    links_data.values
   end
 
   private
@@ -57,4 +64,13 @@ def masquerade_url(url)
       url
     end
   end
 
+  def ensure_link(links_data, url, link_element)
+    LINKS_MUTEX.synchronize do
+      unless links_data[url]
+        links_data[url] = Link.new(url, link_element, @domain)
+      end
+      links_data[url]
+    end
+  end
 end
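
The SLICE_SIZE, LINKS_MUTEX, and PRY_MUTEX constants referenced above sit outside this diff's hunks; something along these lines is presumably defined near the top of the file (the values here are illustrative, not from the commit):

  SLICE_SIZE = 25          # sitemap URLs handled per worker thread
  LINKS_MUTEX = Mutex.new  # serializes access to the shared links_data hash
  PRY_MUTEX = Mutex.new    # keeps interactive debugging sessions from interleaving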
10 changes: 6 additions & 4 deletions lib/link_checker.rb
@@ -27,17 +27,19 @@ def fetch_sitemap
     fetcher = SitemapFetcher.new(@domain, @masquerade_domain)
     @sitemap_urls = fetcher.fetch_sitemap_urls
     puts "Fetched sitemap with #{@sitemap_urls.size} URLs."
-  rescue => e
+  rescue StandardError => e
     puts "Error fetching sitemap: #{e.message}"
     exit
   end
 
   def download_and_analyze_links
     if File.exist?(LINKS_DATA_FILE)
+      # Loading from cache: Parse JSON data into Link objects
       links_data_hashes = JSON.parse(File.read(LINKS_DATA_FILE), symbolize_names: true)
       @links_data = links_data_hashes.map { |hash| Link.from_h(hash) }
       puts "Loaded links data from cache."
     else
+      # Fetching fresh data: Use LinkAnalyzer to get Link objects and cache for future use
      analyzer = LinkAnalyzer.new(@domain, @masquerade_domain)
      @links_data = analyzer.analyze_links(@sitemap_urls)

@@ -46,15 +48,15 @@ def download_and_analyze_links

puts "Links data saved to cache."
end
rescue => e
rescue StandardError => e
puts "Error downloading and analyzing links: #{e.message}"
exit
end

def validate_links
validator = LinkValidator.new(@links_data, @domain, @masquerade_domain)
@links_data = validator.validate_links
rescue => e
rescue StandardError => e
# PRY_MUTEX.synchronize{binding.pry}
puts "Error validating links: #{e.message}"
exit
@@ -64,7 +66,7 @@ def generate_report
     generator = ReportGenerator.new(@links_data, @report_file)
     generator.generate
     puts "Report generated at #{@report_file}."
-  rescue => e
+  rescue StandardError => e
     puts "Error generating report: #{e.message}"
   end
 end