From 5c99be987224872a79c57ae0114fe66fbd962711 Mon Sep 17 00:00:00 2001 From: Kingdon Barrett Date: Mon, 14 Aug 2023 18:21:57 -0400 Subject: [PATCH 01/12] add some specificity, refining ChatGPT has identified download_and_analyze_links as a likely culprit We'll now delve into the files that make it up and see what we can fix Signed-off-by: Kingdon Barrett --- lib/link_checker.rb | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/link_checker.rb b/lib/link_checker.rb index f515e42..961f615 100644 --- a/lib/link_checker.rb +++ b/lib/link_checker.rb @@ -27,17 +27,19 @@ def fetch_sitemap fetcher = SitemapFetcher.new(@domain, @masquerade_domain) @sitemap_urls = fetcher.fetch_sitemap_urls puts "Fetched sitemap with #{@sitemap_urls.size} URLs." - rescue => e + rescue StandardError => e puts "Error fetching sitemap: #{e.message}" exit end def download_and_analyze_links if File.exist?(LINKS_DATA_FILE) + # Loading from cache: Parse JSON data into Link objects links_data_hashes = JSON.parse(File.read(LINKS_DATA_FILE), symbolize_names: true) @links_data = links_data_hashes.map { |hash| Link.from_h(hash) } puts "Loaded links data from cache." else + # Fetching fresh data: Use LinkAnalyzer to get Link objects and cache for future use analyzer = LinkAnalyzer.new(@domain, @masquerade_domain) @links_data = analyzer.analyze_links(@sitemap_urls) @@ -46,7 +48,7 @@ def download_and_analyze_links puts "Links data saved to cache." 
end - rescue => e + rescue StandardError => e puts "Error downloading and analyzing links: #{e.message}" exit end @@ -54,7 +56,7 @@ def download_and_analyze_links def validate_links validator = LinkValidator.new(@links_data, @domain, @masquerade_domain) @links_data = validator.validate_links - rescue => e + rescue StandardError => e # PRY_MUTEX.synchronize{binding.pry} puts "Error validating links: #{e.message}" exit @@ -64,7 +66,7 @@ def generate_report generator = ReportGenerator.new(@links_data, @report_file) generator.generate puts "Report generated at #{@report_file}." - rescue => e + rescue StandardError => e puts "Error generating report: #{e.message}" end end From b5c42abc133c1c24f2e4c1337af02c7514738c27 Mon Sep 17 00:00:00 2001 From: Kingdon Barrett Date: Mon, 14 Aug 2023 19:56:50 -0400 Subject: [PATCH 02/12] make use of helpers Signed-off-by: Kingdon Barrett --- lib/link.rb | 32 +++++++++++--------------- lib/link/link_analyzer.rb | 27 +++++++++++++--------- lib/url_helper.rb | 21 +++++++++++++++++ lib/validator/remote_link_validator.rb | 6 ++--- 4 files changed, 54 insertions(+), 32 deletions(-) create mode 100644 lib/url_helper.rb diff --git a/lib/link.rb b/lib/link.rb index 4b3cb34..eb18584 100644 --- a/lib/link.rb +++ b/lib/link.rb @@ -1,3 +1,6 @@ +require './lib/cache_helper' +require './lib/url_helper' + class Link attr_accessor :source_file, :target, :type, :anchor, :response_status, :link_string, :link_text, :line_no, :reference_intact @@ -46,7 +49,7 @@ def self.from_h(hash) end def download_and_store - cache_path = get_cache_path + cache_path = CacheHelper.get_cache_path(@source_file) unless File.exist?(cache_path) html_content = Net::HTTP.get(URI(@source_file)) FileUtils.mkdir_p(File.dirname(cache_path)) @@ -66,6 +69,14 @@ def reference_intact? @reference_intact end + def set_error(error_message) + @error = error_message + end + + def has_error? + !@error.nil? 
+ end + private def determine_type @@ -77,27 +88,12 @@ def determine_type end def extract_anchor - @anchor = URI(@link_string).fragment - rescue URI::InvalidURIError - @anchor = URI(URI::Parser.new.escape(@link_string)).fragment + @anchor = URLHelper.extract_fragment(@link_string) end def make_absolute return unless @link_string - @target = URI.join(@source_file, @link_string).to_s - rescue URI::InvalidURIError - @target = URI.join(@source_file, URI::Parser.new.escape(@link_string)).to_s - nil - end - - def get_cache_path - uri = URI(@source_file) - cache_path = "cache" + uri.path - # If the path doesn't have a common file extension, treat it as a directory. - unless cache_path.match(/\.(html|xml|json|txt|js|css|jpg|jpeg|png|gif)$/i) - cache_path += "/index.html" - end - cache_path + @target = URLHelper.make_absolute(@source_file, @link_string) end end diff --git a/lib/link/link_analyzer.rb b/lib/link/link_analyzer.rb index 30074e3..9d0d0ff 100644 --- a/lib/link/link_analyzer.rb +++ b/lib/link/link_analyzer.rb @@ -9,16 +9,17 @@ def initialize(domain, masquerade_domain) end def analyze_links(sitemap_urls) - links_data = [] + links_data = {} threads = [] sitemap_urls.each_slice(SLICE_SIZE) do |slice| threads << Thread.new do slice.each do |url| + link = ensure_link(links_data, url, nil) begin url = masquerade_url(url) if @masquerade_domain puts "Visiting: #{url}" - doc = Link.new(url, nil, @domain).download_and_store + doc = link.download_and_store # Extracting all the links from the page doc.css('a').each do |link_element| @@ -26,16 +27,11 @@ def analyze_links(sitemap_urls) # Skip links without href or with href set to '#' next if link_href.nil? 
|| link_href.strip == '#' - begin - link = Link.new(url, link_element, @domain) - rescue URI::InvalidURIError => e - PRY_MUTEX.synchronize{binding.pry} - end - LINKS_MUTEX.synchronize do - links_data << link - end + target_url = URI.join(url, link_href).to_s + link = ensure_link(links_data, target_url, link_element) end rescue StandardError => e + link.response_status = "Error: #{e.message}" puts "Error downloading or analyzing URL #{url}: #{e.message}" end end @@ -43,7 +39,7 @@ def analyze_links(sitemap_urls) end threads.each(&:join) - links_data + links_data.values end private @@ -57,4 +53,13 @@ def masquerade_url(url) url end end + + def ensure_link(links_data, url, link_element) + LINKS_MUTEX.synchronize do + unless links_data[url] + links_data[url] = Link.new(url, link_element, @domain) + end + links_data[url] + end + end end diff --git a/lib/url_helper.rb b/lib/url_helper.rb new file mode 100644 index 0000000..b0fd03e --- /dev/null +++ b/lib/url_helper.rb @@ -0,0 +1,21 @@ +module URLHelper + def self.make_absolute(base_url, relative_url) + return relative_url if relative_url.nil? 
+ + begin + URI.join(base_url, relative_url).to_s + rescue URI::InvalidURIError + URI.join(base_url, URI::Parser.new.escape(relative_url)).to_s + end + end + + def self.extract_fragment(url) + return nil unless url + + begin + URI(url).fragment + rescue URI::InvalidURIError + URI(URI::Parser.new.escape(url)).fragment + end + end +end diff --git a/lib/validator/remote_link_validator.rb b/lib/validator/remote_link_validator.rb index 4daa25c..4a77ac3 100644 --- a/lib/validator/remote_link_validator.rb +++ b/lib/validator/remote_link_validator.rb @@ -15,13 +15,13 @@ def validate retries += 1 retry if retries < MAX_RETRIES puts "Error after #{MAX_RETRIES} retries for link #{link.target}: #{e.message}" - link.response_status = "Timeout" + link.set_error "Timeout" rescue SocketError => e puts "Network error for link #{link.target}: #{e.message}" - link.response_status = "Network Error" + link.set_error "Network Error" rescue StandardError => e puts "Unexpected error for link #{link.target}: #{e.message}" - link.response_status = "Error" + link.set_error "Error (#{e.message})" end end end From 94ec62996d119feb50327e040e1c6e1eab0585d1 Mon Sep 17 00:00:00 2001 From: Kingdon Barrett Date: Mon, 14 Aug 2023 20:08:59 -0400 Subject: [PATCH 03/12] try to implement chatgpt's refinements Signed-off-by: Kingdon Barrett --- lib/validator/link_validator.rb | 54 +++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/lib/validator/link_validator.rb b/lib/validator/link_validator.rb index 3fec91e..f57c40c 100644 --- a/lib/validator/link_validator.rb +++ b/lib/validator/link_validator.rb @@ -5,41 +5,49 @@ class LinkValidator MAX_THREADS = 4 def initialize(links_data, domain, masquerade_domain, process_remote_links = false) - @links_data = links_data.map do |link| - link.dup.tap do |ld| - ld.target = ld.target.gsub(domain, masquerade_domain) if ld.type != 'remote' - end - end + @links_data = links_data + adjust_links_target(domain, 
masquerade_domain) + @parsed_docs_cache = {} + @parsed_docs_cache_mutex = Mutex.new @domain = domain @masquerade_domain = masquerade_domain @process_remote_links = process_remote_links end def validate_links - # Separate remote links for parallel processing - remote_links = @links_data.select { |link| link.type == 'remote' } - local_links = @links_data.reject { |link| link.type == 'remote' } + handle_local_links + handle_remote_links if @process_remote_links + @links_data + end - # Handle local links - local_links.each do |link| - next if link.target =~ /^mailto:/ - LocalLinkValidator.new(link, @parsed_docs_cache).validate + private + + def adjust_links_target(domain, masquerade_domain) + @links_data.each do |link| + link.target.gsub!(domain, masquerade_domain) if link.type != 'remote' end + end - if @process_remote_links - # Parallel processing for remote links - thread_pool = [] - remote_links.each_slice(remote_links.size / MAX_THREADS + 1) do |link_slice| - thread_pool << Thread.new do - link_slice.each do |link| - RemoteLinkValidator.new(link).validate - end + def handle_local_links + @links_data.each do |link| + next if link.type == 'remote' || link.target =~ /^mailto:/ + + validator = LocalLinkValidator.new(link, @parsed_docs_cache, @parsed_docs_cache_mutex) + validator.validate + end + end + + def handle_remote_links + thread_pool = [] + remote_links = @links_data.select { |link| link.type == 'remote' } + remote_links.each_slice(remote_links.size / MAX_THREADS + 1) do |link_slice| + thread_pool << Thread.new do + link_slice.each do |link| + RemoteLinkValidator.new(link).validate end end - thread_pool.each(&:join) end - - @links_data + thread_pool.each(&:join) end end From b3aec77187d2e91a8c20a560fa6421391b17493b Mon Sep 17 00:00:00 2001 From: Kingdon Barrett Date: Mon, 14 Aug 2023 20:43:25 -0400 Subject: [PATCH 04/12] fix some bugs according to gpt Signed-off-by: Kingdon Barrett --- lib/cache_helper.rb | 18 ++++++++++++++---- lib/link.rb | 19 
++++++++++++++----- lib/validator/base_link_validator.rb | 5 +++-- lib/validator/local_link_validator.rb | 14 +++++++++----- 4 files changed, 40 insertions(+), 16 deletions(-) diff --git a/lib/cache_helper.rb b/lib/cache_helper.rb index 442bb57..7c8e20c 100644 --- a/lib/cache_helper.rb +++ b/lib/cache_helper.rb @@ -1,13 +1,23 @@ module CacheHelper def self.get_cache_path(url) uri = URI(url) - cache_path = "cache" + uri.path - + cache_path = File.join("cache", uri.path) # If the path doesn't have a common file extension, treat it as a directory. unless cache_path.match(/\.(html|xml|json|txt|js|css|jpg|jpeg|png|gif)$/i) - cache_path += "/index.html" + cache_path = File.join(cache_path, "index.html") end - cache_path end + + def self.write_to_cache(url, content, status) + cache_path = get_cache_path(url) + data = { content: content, status: status } + File.write(cache_path, JSON.dump(data)) + end + + def self.read_from_cache(url) + cache_path = get_cache_path(url) + data = JSON.parse(File.read(cache_path)) + [data["content"], data["status"]] + end end diff --git a/lib/link.rb b/lib/link.rb index eb18584..3ee8bf5 100644 --- a/lib/link.rb +++ b/lib/link.rb @@ -15,6 +15,11 @@ def initialize(source_url, link_element, domain) @line_no = link_element.line determine_type extract_anchor + else + # If no link_element is provided, assume the source is the target and type is local. 
+ @link_string = source_url + @target = source_url + @type = 'local' end make_absolute @@ -50,12 +55,16 @@ def self.from_h(hash) def download_and_store cache_path = CacheHelper.get_cache_path(@source_file) - unless File.exist?(cache_path) - html_content = Net::HTTP.get(URI(@source_file)) - FileUtils.mkdir_p(File.dirname(cache_path)) - File.write(cache_path, html_content) + if File.exist?(cache_path) + html_content, status = CacheHelper.read_from_cache(@source_file) + @response_status = status if status && status.to_i >= 400 else - html_content = File.read(cache_path) + response = Net::HTTP.get_response(URI(@source_file)) + html_content = response.body + # Ensure the directory exists before writing the cache + FileUtils.mkdir_p(File.dirname(cache_path)) + CacheHelper.write_to_cache(@source_file, html_content, response.code) + @response_status = response.code if response.code.to_i >= 400 end Nokogiri::HTML(html_content) diff --git a/lib/validator/base_link_validator.rb b/lib/validator/base_link_validator.rb index 7043d26..a8caf39 100644 --- a/lib/validator/base_link_validator.rb +++ b/lib/validator/base_link_validator.rb @@ -1,9 +1,10 @@ class BaseLinkValidator - attr_reader :link, :parsed_docs_cache + attr_reader :link, :parsed_docs_cache, :links_mutex - def initialize(link, parsed_docs_cache = {}) + def initialize(link, parsed_docs_cache = {}, links_mutex = Mutex.new) @link = link @parsed_docs_cache = parsed_docs_cache + @links_mutex = links_mutex end def valid_anchor? 
diff --git a/lib/validator/local_link_validator.rb b/lib/validator/local_link_validator.rb index 1ad17fa..804887d 100644 --- a/lib/validator/local_link_validator.rb +++ b/lib/validator/local_link_validator.rb @@ -3,18 +3,22 @@ class LocalLinkValidator < BaseLinkValidator def validate + return if @link.response_status && @link.response_status.to_i >= 400 + normalized_url = URI(@link.target).normalize.to_s cache_path = CacheHelper.get_cache_path(normalized_url) return @link.response_status = "Not Cached" unless File.exist?(cache_path) - unless @parsed_docs_cache[normalized_url] - html_content = File.read(cache_path) - @parsed_docs_cache[normalized_url] = Nokogiri::HTML(html_content) + doc = nil + @links_mutex.synchronize do + unless @parsed_docs_cache[normalized_url] + html_content = File.read(cache_path) + @parsed_docs_cache[normalized_url] = Nokogiri::HTML(html_content) + end + doc = @parsed_docs_cache[normalized_url] end - doc = @parsed_docs_cache[normalized_url] - if valid_anchor? escaped = escaped_anchor @link.check_reference_intact!(escaped_anchor, doc) From 0718c52df2186d61574643994dd9ca2b8af8345b Mon Sep 17 00:00:00 2001 From: Kingdon Barrett Date: Mon, 14 Aug 2023 21:00:26 -0400 Subject: [PATCH 05/12] fix encoding issues to parse all the links This change resolves all outstanding crashes and reduces the final output to something less than we were emitting before. (Did the links that dropped out of the report incorrectly represent an error?) 
Signed-off-by: Kingdon Barrett --- lib/link.rb | 1 + lib/link/link_analyzer.rb | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/lib/link.rb b/lib/link.rb index 3ee8bf5..9e51ade 100644 --- a/lib/link.rb +++ b/lib/link.rb @@ -61,6 +61,7 @@ def download_and_store else response = Net::HTTP.get_response(URI(@source_file)) html_content = response.body + html_content.force_encoding('UTF-8') # Ensure the directory exists before writing the cache FileUtils.mkdir_p(File.dirname(cache_path)) CacheHelper.write_to_cache(@source_file, html_content, response.code) diff --git a/lib/link/link_analyzer.rb b/lib/link/link_analyzer.rb index 9d0d0ff..2712426 100644 --- a/lib/link/link_analyzer.rb +++ b/lib/link/link_analyzer.rb @@ -18,7 +18,11 @@ def analyze_links(sitemap_urls) link = ensure_link(links_data, url, nil) begin url = masquerade_url(url) if @masquerade_domain - puts "Visiting: #{url}" + base_url, fragment = url.split('#', 2) + fragment = URI::Parser.new.escape(fragment) if fragment + full_url = fragment ? "#{base_url}##{fragment}" : base_url + + puts "Visiting: #{full_url}" doc = link.download_and_store # Extracting all the links from the page @@ -27,7 +31,14 @@ def analyze_links(sitemap_urls) # Skip links without href or with href set to '#' next if link_href.nil? || link_href.strip == '#' - target_url = URI.join(url, link_href).to_s + # Splitting the base URL and fragment for proper handling + base_url, fragment = link_href.split('#', 2) + fragment = URI::Parser.new.escape(fragment) if fragment + + # Combine the base URL with the original URL and append the fragment if present + joined_url = URI.join(url, base_url).to_s + target_url = fragment ? 
"#{joined_url}##{fragment}" : joined_url + link = ensure_link(links_data, target_url, link_element) end rescue StandardError => e From 3fe4b4741238db02f572fa0256c36386b5b3443f Mon Sep 17 00:00:00 2001 From: Kingdon Barrett Date: Mon, 14 Aug 2023 22:13:06 -0400 Subject: [PATCH 06/12] use link_text setter Signed-off-by: Kingdon Barrett --- lib/link.rb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/link.rb b/lib/link.rb index 9e51ade..9008c4e 100644 --- a/lib/link.rb +++ b/lib/link.rb @@ -11,7 +11,7 @@ def initialize(source_url, link_element, domain) if link_element @link_string = link_element['href'] - @link_text = link_element.text.strip + link_text = link_element.text @line_no = link_element.line determine_type extract_anchor @@ -25,6 +25,10 @@ def initialize(source_url, link_element, domain) make_absolute end + def link_text=(value) + @link_text = value.strip.gsub(/\s+/, ' ') + end + def to_h { source_file: @source_file, From 3205ff9f5e24d8b606bdaec9a99a2fdf6b8899ab Mon Sep 17 00:00:00 2001 From: Kingdon Barrett Date: Mon, 14 Aug 2023 22:28:08 -0400 Subject: [PATCH 07/12] add a normalize target in the makefile Signed-off-by: Kingdon Barrett --- Makefile | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 760158d..b531c1a 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: main clean-cache preview all clean +.PHONY: main clean-cache preview all clean normalize all: main clean-cache preview main: @@ -15,3 +15,13 @@ preview: clean: clean-cache @rm -f report.csv preview-report.csv @echo "Clean complete!" 
+ +normalize: + @# Normalize the main report.csv + @gsed -i '1d' report.csv + @awk 'NR==1{print $0; next} {print $0 | "sort"}' report.csv > tmp.csv && mv tmp.csv report.csv + @gsed -i 's/fluxcd.io/deploy-preview-1573--fluxcd.netlify.app/1; s/fluxcd.io/deploy-preview-1573--fluxcd.netlify.app/1' report.csv + + @# Normalize the preview-report.csv + @gsed -i '1d' preview-report.csv + @awk 'NR==1{print $0; next} {print $0 | "sort"}' preview-report.csv > tmp.csv && mv tmp.csv preview-report.csv From 5fb238b3d52c0e8b0372fbbd50328d5269b10ad3 Mon Sep 17 00:00:00 2001 From: Kingdon Barrett Date: Mon, 14 Aug 2023 22:31:47 -0400 Subject: [PATCH 08/12] introducing summary.rb Signed-off-by: Kingdon Barrett --- Makefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index b531c1a..2d8e474 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ -.PHONY: main clean-cache preview all clean normalize -all: main clean-cache preview +.PHONY: main clean-cache preview all clean normalize summary +all: main clean-cache preview normalize summary main: ruby ./main.rb @@ -25,3 +25,6 @@ normalize: @# Normalize the preview-report.csv @gsed -i '1d' preview-report.csv @awk 'NR==1{print $0; next} {print $0 | "sort"}' preview-report.csv > tmp.csv && mv tmp.csv preview-report.csv + +summary: + ruby ./lib/summary.rb From c108da71a441466582f966aa75e8f10f53a31649 Mon Sep 17 00:00:00 2001 From: Kingdon Barrett Date: Mon, 14 Aug 2023 22:31:54 -0400 Subject: [PATCH 09/12] summary reporting for CI Signed-off-by: Kingdon Barrett --- lib/summary.rb | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 lib/summary.rb diff --git a/lib/summary.rb b/lib/summary.rb new file mode 100644 index 0000000..e5dd4a2 --- /dev/null +++ b/lib/summary.rb @@ -0,0 +1,24 @@ +main_report = File.readlines('report.csv').map(&:strip) +preview_report = File.readlines('preview-report.csv').map(&:strip) + +# Find the differences between the two reports 
+resolved_issues = main_report - preview_report +new_issues = preview_report - main_report + +puts "Summary:" +puts "--------" + +puts "Total issues in main site: #{main_report.count}" +puts "Total issues in preview site: #{preview_report.count}" + +puts "\nResolved issues: #{resolved_issues.count}" +puts "New issues: #{new_issues.count}" + +# Check if there are any new issues +if new_issues.count > 0 + puts "\nFail: The preview site has introduced new issues!" + exit(1) +else + puts "\nPass: No new issues introduced in the preview site." + exit(0) +end From 1d264ead6d2f24a03a526691a76a0868785d9a62 Mon Sep 17 00:00:00 2001 From: Kingdon Barrett Date: Mon, 14 Aug 2023 23:06:20 -0400 Subject: [PATCH 10/12] Emit summary reports when summary.rb is called The interesting report is pr-summary.csv because it ostensibly tells what wrong things were caused by this PR, such that we cannot merge it without fixing them. May also be interested in baseline-unresolved.csv so we can start to tackle some of the issues that are in the deployed website. Signed-off-by: Kingdon Barrett --- Makefile | 2 +- lib/summary.rb | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 2d8e474..f9610e6 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ preview: ruby ./main.rb fluxcd.io deploy-preview-1573--fluxcd.netlify.app preview-report.csv false clean: clean-cache - @rm -f report.csv preview-report.csv + @rm -f report.csv preview-report.csv pr-summary.csv baseline-unresolved.csv @echo "Clean complete!" 
normalize: diff --git a/lib/summary.rb b/lib/summary.rb index e5dd4a2..384b11a 100644 --- a/lib/summary.rb +++ b/lib/summary.rb @@ -1,3 +1,7 @@ +require 'csv' + +HEADER = ["Link Source", "Link Target", "Type", "Anchor?", "Reference Intact?", "Response Status", "Link String", "Link Text", "Line No."] + main_report = File.readlines('report.csv').map(&:strip) preview_report = File.readlines('preview-report.csv').map(&:strip) @@ -5,6 +9,24 @@ resolved_issues = main_report - preview_report new_issues = preview_report - main_report +unresolved_issues = main_report & preview_report + +# Write to the pr-summary.csv +CSV.open('pr-summary.csv', 'wb') do |csv| + csv << HEADER + new_issues.each do |issue| + csv << issue.split(',') + end +end + +# Write to the baseline-unresolved.csv +CSV.open('baseline-unresolved.csv', 'wb') do |csv| + csv << HEADER + unresolved_issues.each do |issue| + csv << issue.split(',') + end +end + puts "Summary:" puts "--------" @@ -14,9 +36,19 @@ puts "\nResolved issues: #{resolved_issues.count}" puts "New issues: #{new_issues.count}" -# Check if there are any new issues +# Check if there are any new issues and show top 3 problematic links if new_issues.count > 0 puts "\nFail: The preview site has introduced new issues!" + puts "\nTop 3 problematic links introduced in the PR:" + + new_issues.first(3).each do |issue| + data = issue.split(',') + puts "Link: #{data[1]}" + puts "Found on: #{data[0]}" + puts "---------" + end + + puts "Please check pr-summary.csv for the full list of new issues." exit(1) else puts "\nPass: No new issues introduced in the preview site." 
From 4bfa1c899b4f8a8b41930213e06cfb1fed72aaa3 Mon Sep 17 00:00:00 2001 From: Kingdon Barrett Date: Mon, 14 Aug 2023 23:50:57 -0400 Subject: [PATCH 11/12] the action according to gpt Signed-off-by: Kingdon Barrett --- Makefile | 11 ++++++--- action/action.yml | 57 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 3 deletions(-) create mode 100644 action/action.yml diff --git a/Makefile b/Makefile index f9610e6..8c081e7 100644 --- a/Makefile +++ b/Makefile @@ -12,6 +12,10 @@ clean-cache: preview: ruby ./main.rb fluxcd.io deploy-preview-1573--fluxcd.netlify.app preview-report.csv false +run_with_preview: + @echo "Running with preview URL: $(PREVIEW_URL)" + ruby ./main.rb fluxcd.io $(PREVIEW_URL) preview-report.csv false + clean: clean-cache @rm -f report.csv preview-report.csv pr-summary.csv baseline-unresolved.csv @echo "Clean complete!" @@ -19,12 +23,13 @@ clean: clean-cache normalize: @# Normalize the main report.csv @gsed -i '1d' report.csv - @awk 'NR==1{print $0; next} {print $0 | "sort"}' report.csv > tmp.csv && mv tmp.csv report.csv - @gsed -i 's/fluxcd.io/deploy-preview-1573--fluxcd.netlify.app/1; s/fluxcd.io/deploy-preview-1573--fluxcd.netlify.app/1' report.csv + @PREVIEW_DOMAIN=$(if [ -z "$(PREVIEW_URL)" ]; then echo "deploy-preview-1573--fluxcd.netlify.app"; else echo "$(PREVIEW_URL)"; fi) + @gsed -i "s/fluxcd.io/$$PREVIEW_DOMAIN/1; s/fluxcd.io/$$PREVIEW_DOMAIN/1" report.csv + @sort -o report.csv report.csv @# Normalize the preview-report.csv @gsed -i '1d' preview-report.csv - @awk 'NR==1{print $0; next} {print $0 | "sort"}' preview-report.csv > tmp.csv && mv tmp.csv preview-report.csv + @sort -o preview-report.csv preview-report.csv summary: ruby ./lib/summary.rb diff --git a/action/action.yml b/action/action.yml new file mode 100644 index 0000000..4f98dff --- /dev/null +++ b/action/action.yml @@ -0,0 +1,57 @@ +name: Link Checker + +on: [pull_request] + +jobs: + check-links: + runs-on: ubuntu-latest + + steps: + - name: 
Checkout code + uses: actions/checkout@v2 + + - name: Set up Ruby 3.0 + uses: ruby/setup-ruby@v1 + with: + ruby-version: 3.0 + bundler-cache: true + + - name: Run main target + run: make main + + - name: Clean cache + run: make clean-cache + + - name: Run with preview + run: make run_with_preview + + - name: Normalize reports + run: make normalize + + - name: Run summary + id: run-summary + run: make summary + continue-on-error: true + + - name: Check summary results + run: ./.github/scripts/check_summary.sh + if: steps.run-summary.outcome == 'failure' + + - name: Comment on PR if necessary + run: ./.github/scripts/comment_on_pr.sh + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload pr-summary.csv + uses: actions/upload-artifact@v3 + if: always() && steps.run-summary.outcome == 'failure' + with: + name: pr-summary + path: pr-summary.csv + + - name: Upload baseline-unresolved.csv + uses: actions/upload-artifact@v3 + if: always() && steps.run-summary.outcome == 'success' + with: + name: baseline-unresolved + path: baseline-unresolved.csv From 469b7701ae15ada6e75cb66a4e5c2c0a744622b2 Mon Sep 17 00:00:00 2001 From: Kingdon Barrett Date: Mon, 14 Aug 2023 23:51:14 -0400 Subject: [PATCH 12/12] github scripts and test workflow Signed-off-by: Kingdon Barrett --- .github/scripts/check_summary.sh | 10 ++++++++++ .github/scripts/comment_on_pr.sh | 9 +++++++++ .github/workflows/test.yml | 29 +++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+) create mode 100755 .github/scripts/check_summary.sh create mode 100755 .github/scripts/comment_on_pr.sh create mode 100644 .github/workflows/test.yml diff --git a/.github/scripts/check_summary.sh b/.github/scripts/check_summary.sh new file mode 100755 index 0000000..f5ee5a3 --- /dev/null +++ b/.github/scripts/check_summary.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -e + +LINE_COUNT=$(wc -l < pr-summary.csv) + +if [ "$LINE_COUNT" -gt 1 ]; then + echo "Issues found in PR. Attaching pr-summary.csv for review..." 
+else
+  echo "No direct issues found in PR. Attaching baseline-unresolved.csv for reference..."
+fi
diff --git a/.github/scripts/comment_on_pr.sh b/.github/scripts/comment_on_pr.sh
new file mode 100755
index 0000000..8336e70
--- /dev/null
+++ b/.github/scripts/comment_on_pr.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+set -e
+
+LINE_COUNT=$(wc -l < pr-summary.csv)
+
+if [ "$LINE_COUNT" -le 1 ]; then
+  # Using GitHub CLI to comment on the PR. PR_NUMBER must be exported by the calling workflow step.
+  gh pr comment "${PR_NUMBER:?PR number must be set in the environment}" --body "Warning: Some unresolved baseline issues are present. Please check the attached baseline-unresolved.csv."
+fi
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..c64a26c
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,29 @@
+name: Test Action Workflow
+
+on:
+  workflow_dispatch:
+    inputs:
+      prNumber:
+        description: 'PR number to test against'
+        required: true
+        default: '1573'
+
+jobs:
+  test-action:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+
+      - name: Set up Ruby 3.0
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: 3.0
+          bundler-cache: true
+
+      - name: Run the action
+        uses: ./action/
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          prNumber: ${{ github.event.inputs.prNumber }}