Skip to content

Commit

Permalink
Generate sitemap for Google Webmaster Tools.
Browse files Browse the repository at this point in the history
  • Loading branch information
lazyatom committed Nov 14, 2013
1 parent ea76681 commit 088a477
Showing 1 changed file with 308 additions and 0 deletions.
308 changes: 308 additions & 0 deletions _plugins/sitemap_generator.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,308 @@
# Sitemap.xml Generator is a Jekyll plugin that generates a sitemap.xml file by
# traversing all of the available posts and pages.
#
# How To Use:
# 1.) Copy source file into your _plugins folder within your Jekyll project.
# 2.) Change MY_URL to reflect your domain name.
# 3.) Change SITEMAP_FILE_NAME if you want your sitemap to be called something
# other than sitemap.xml.
# 4.) Change the PAGES_INCLUDE_POSTS list to include any pages that are looping
# through your posts (e.g. "index.html", "archive.html", etc.). This will
# ensure that right after you make a new post, the last modified date will
# be updated to reflect the new post.
# 5.) Run Jekyll: jekyll --server to re-generate your site.
# 6.) A sitemap.xml should be included in your _site folder.
#
# Customizations:
# 1.) If there are any files you don't want included in the sitemap, add them
# to the EXCLUDED_FILES list. The name should match the name of the source
# file.
# 2.) If you want to include the optional changefreq and priority attributes,
# simply include custom variables in the YAML Front Matter of that file.
# The names of these custom variables are defined below in the
# CHANGE_FREQUENCY_CUSTOM_VARIABLE_NAME and PRIORITY_CUSTOM_VARIABLE_NAME
# constants.
#
# Notes:
# 1.) The last modified date is determined by the latest from the following:
# system modified date of the page or post, system modified date of
# included layout, system modified date of included layout within that
# layout, ...
#
# Author: Michael Levin
# Site: http://www.kinnetica.com
# Distributed Under A Creative Commons License
# - http://creativecommons.org/licenses/by/3.0/

require 'rexml/document'

module Jekyll

# Change MY_URL to reflect the site you are using. It is prepended to every
# location written into the sitemap.
MY_URL = "http://exciting.io".freeze

# Change SITEMAP_FILE_NAME if you would like your sitemap file
# to be called something else
SITEMAP_FILE_NAME = "sitemap.xml".freeze

# Any files to exclude from being included in the sitemap.xml. Entries are
# compared against the source file name of each page or post.
EXCLUDED_FILES = ["feed.xml"].freeze

# Any files that include posts, so that when a new post is added, the last
# modified date of these pages should take that into account
PAGES_INCLUDE_POSTS = ["index.html", "blog.html"].freeze

# Custom variable names for changefreq and priority elements
# These names are used within the YAML Front Matter of pages or posts
# for which you want to include these properties
CHANGE_FREQUENCY_CUSTOM_VARIABLE_NAME = "change_frequency".freeze
PRIORITY_CUSTOM_VARIABLE_NAME = "priority".freeze

# Sitemap helpers for Post: the location of its source file and the public
# URL under which it is served.
class Post
  attr_reader :name
  attr_writer :name

  # Path of the post's source file, joined from @base and @name.
  #
  # Returns the path String
  def full_path_to_source
    base_dir = @base
    file_name = @name
    File.join(base_dir, file_name)
  end

  # Public URL of the post: MY_URL immediately followed by the post's url.
  #
  # Returns the URL String
  def location_on_server
    site_root = MY_URL
    relative_url = url
    "#{site_root}#{relative_url}"
  end
end

# Sitemap helpers for Page: the location of its source file and the public
# URL under which it is served.
class Page
  attr_accessor :name

  # Path of the page's source file, joined from @base, @dir and @name.
  #
  # Returns the path String
  def full_path_to_source
    File.join(@base, @dir, @name)
  end

  # Public URL of the page: MY_URL, then @dir, then the page's url.
  #
  # A final path segment that is exactly "index.html" is dropped, so a
  # directory index is listed under its directory URL. The pattern requires a
  # preceding "/" and a literal dot, so names that merely end in the same
  # characters (e.g. "myindex.html") are left untouched.
  #
  # Returns the URL String
  def location_on_server
    location = "#{MY_URL}#{@dir}#{url}"
    location.sub(%r{(?<=/)index\.html\z}, "")
  end
end

# Sitemap helper for Layout: exposes the location of the layout's source file
# so its modification time can be inspected.
class Layout
  # Path of the layout's source file, joined from @base and @name.
  #
  # Returns the path String
  def full_path_to_source
    base_dir = @base
    file_name = @name
    File.join(base_dir, file_name)
  end
end

# StaticFile variant used for the generated sitemap. Its write never lets a
# StandardError escape: the original author hit a strange exception when
# starting the server without --auto, so failures of the parent write are
# deliberately ignored.
class SitemapFile < StaticFile
  # Delegate to the parent write, swallowing any StandardError it raises.
  #
  # dest - destination argument, forwarded unchanged to the parent write
  #
  # Returns true in every case
  def write(dest)
    super(dest)
    true
  rescue StandardError
    true
  end
end

# Generator that builds a sitemap XML document from the site's posts and
# pages and writes it into the site's destination directory.
class SitemapGenerator < Generator

  # Valid values allowed by sitemap.xml spec for change frequencies
  VALID_CHANGE_FREQUENCY_VALUES =
    %w[always hourly daily weekly monthly yearly never].freeze

  # Goes through pages and posts and generates sitemap.xml file
  #
  # site - the site object; this method reads site.dest and appends to
  #        site.static_files, and the helpers it calls read site.posts,
  #        site.pages and site.layouts
  #
  # Returns the result of appending the SitemapFile to site.static_files
  def generate(site)
    # fill_last_modified treats a nil @last_modified_post_date as "no post
    # date known yet". Reset it so a value left over from an earlier call on
    # this same instance cannot leak into the post pass below.
    @last_modified_post_date = nil

    sitemap = REXML::Document.new
    sitemap << REXML::XMLDecl.new("1.0", "UTF-8")

    urlset = REXML::Element.new "urlset"
    urlset.add_attribute("xmlns",
                         "http://www.sitemaps.org/schemas/sitemap/0.9")

    @last_modified_post_date = fill_posts(site, urlset)
    fill_pages(site, urlset)

    sitemap.add_element(urlset)

    # File I/O: create sitemap.xml file and write out pretty-printed XML.
    # The block form closes the handle even if formatting raises.
    File.open(File.join(site.dest, SITEMAP_FILE_NAME), "w") do |file|
      formatter = REXML::Formatters::Pretty.new(4)
      formatter.compact = true
      formatter.write(sitemap, file)
    end

    # Keep the sitemap.xml file from being cleaned by Jekyll
    site.static_files << Jekyll::SitemapFile.new(site, site.dest, "/", SITEMAP_FILE_NAME)
  end

  # Create url elements for all the posts and find the date of the latest one
  #
  # Excluded posts get no url element but still count towards the latest
  # modification date.
  #
  # Returns the latest source-file mtime among all posts, or nil when there
  # are no posts
  def fill_posts(site, urlset)
    last_modified_date = nil
    site.posts.each do |post|
      urlset.add_element(fill_url(site, post)) unless excluded?(post.name)

      date = File.mtime(post.full_path_to_source)
      last_modified_date = date if last_modified_date.nil? || date > last_modified_date
    end

    last_modified_date
  end

  # Create url elements for all the normal pages
  #
  # Pages that are excluded, or whose source file does not exist on disk,
  # are skipped.
  #
  # Returns the value of site.pages.each
  def fill_pages(site, urlset)
    site.pages.each do |page|
      next if excluded?(page.name)
      # File.exist? rather than File.exists?, which newer Rubies removed
      next unless File.exist?(page.full_path_to_source)

      urlset.add_element(fill_url(site, page))
    end
  end

  # Fill data of each URL element: location, last modified,
  # change frequency (optional), and priority.
  #
  # Invalid change frequency or priority values are reported with puts and
  # the corresponding element is left out.
  #
  # Returns url REXML::Element
  def fill_url(site, page_or_post)
    url = REXML::Element.new "url"

    url.add_element(fill_location(page_or_post))

    lastmod = fill_last_modified(site, page_or_post)
    url.add_element(lastmod) if lastmod

    changefreq = fill_change_frequency(page_or_post)
    url.add_element(changefreq) if changefreq

    priority = fill_priority(page_or_post)
    url.add_element(priority) if priority

    url
  end

  # Get URL location of page or post
  #
  # Returns the loc REXML::Element holding the page or post location
  def fill_location(page_or_post)
    loc = REXML::Element.new "loc"
    loc.text = page_or_post.location_on_server

    loc
  end

  # Fill lastmod XML element with the last modified date for the page or post.
  #
  # While posts are being processed @last_modified_post_date is still nil, so
  # only the entry's own file and its layouts are considered. Once it is set,
  # entries named in PAGES_INCLUDE_POSTS also take the newest post date into
  # account.
  #
  # Returns lastmod REXML::Element
  def fill_last_modified(site, page_or_post)
    own_date = File.mtime(page_or_post.full_path_to_source)
    latest_date = find_latest_date(own_date, site, page_or_post)

    if !@last_modified_post_date.nil? && posts_included?(page_or_post.name)
      latest_date = greater_date(latest_date, @last_modified_post_date)
    end

    lastmod = REXML::Element.new "lastmod"
    lastmod.text = latest_date.iso8601
    lastmod
  end

  # Go through the page/post and any implemented layouts and get the latest
  # modified date
  #
  # latest_date - starting date to compare the layouts' mtimes against
  #
  # Returns the latest of latest_date and the mtimes of the chained layouts
  def find_latest_date(latest_date, site, page_or_post)
    layouts = site.layouts
    layout = layouts[page_or_post.data["layout"]]
    # Walk up the layout chain: each layout may itself name a parent layout
    while layout
      date = File.mtime(layout.full_path_to_source)
      latest_date = date if date > latest_date

      layout = layouts[layout.data["layout"]]
    end

    latest_date
  end

  # Which of the two dates is later
  #
  # Returns latest of two dates (date1 when they are equal)
  def greater_date(date1, date2)
    date1 >= date2 ? date1 : date2
  end

  # Is the page or post listed as something we want to exclude?
  #
  # Returns boolean
  def excluded?(name)
    EXCLUDED_FILES.include? name
  end

  # Is the page listed as one that includes posts?
  #
  # Returns boolean
  def posts_included?(name)
    PAGES_INCLUDE_POSTS.include? name
  end

  # Is the change frequency value provided valid according to the spec
  #
  # Returns boolean
  def valid_change_frequency?(change_frequency)
    VALID_CHANGE_FREQUENCY_VALUES.include? change_frequency
  end

  # Is the priority value provided valid according to the spec
  #
  # Values that cannot be converted with Float() are invalid; that covers
  # both malformed strings (ArgumentError) and non-numeric objects such as
  # arrays or booleans (TypeError).
  #
  # Returns boolean
  def valid_priority?(priority)
    (0.0..1.0).cover?(Float(priority))
  rescue ArgumentError, TypeError
    false
  end

  private

  # Build the optional changefreq element from the entry's front matter.
  #
  # Returns changefreq REXML::Element, or nil when the value is absent or
  # invalid
  def fill_change_frequency(page_or_post)
    raw_value = page_or_post.data[CHANGE_FREQUENCY_CUSTOM_VARIABLE_NAME]
    return nil unless raw_value

    # to_s guards against front-matter values that are not Strings
    change_frequency = raw_value.to_s.downcase
    unless valid_change_frequency?(change_frequency)
      puts "ERROR: Invalid Change Frequency In #{page_or_post.name}"
      return nil
    end

    changefreq = REXML::Element.new "changefreq"
    changefreq.text = change_frequency
    changefreq
  end

  # Build the optional priority element from the entry's front matter.
  #
  # Returns priority REXML::Element, or nil when the value is absent or
  # invalid
  def fill_priority(page_or_post)
    priority_value = page_or_post.data[PRIORITY_CUSTOM_VARIABLE_NAME]
    return nil unless priority_value

    unless valid_priority?(priority_value)
      puts "ERROR: Invalid Priority In #{page_or_post.name}"
      return nil
    end

    priority = REXML::Element.new "priority"
    priority.text = priority_value
    priority
  end
end
end

0 comments on commit 088a477

Please sign in to comment.