diff --git a/apicache/0c28bbdde90ce46c40c2adaf87f7339ec19a729dcb00249ff42f41f8dc1f8142 b/apicache/0c28bbdde90ce46c40c2adaf87f7339ec19a729dcb00249ff42f41f8dc1f8142 deleted file mode 100644 index 687e888..0000000 Binary files a/apicache/0c28bbdde90ce46c40c2adaf87f7339ec19a729dcb00249ff42f41f8dc1f8142 and /dev/null differ diff --git a/apicache/3153446787d457631aa6ec636af843866c871f5f56b09283e0246b7b9b7addba b/apicache/3153446787d457631aa6ec636af843866c871f5f56b09283e0246b7b9b7addba deleted file mode 100644 index 1eca06d..0000000 Binary files a/apicache/3153446787d457631aa6ec636af843866c871f5f56b09283e0246b7b9b7addba and /dev/null differ diff --git a/apicache/51852b6e6c8ad5c02d87b0d5c643324a490c00b02cb5151c3423567416b6418a b/apicache/51852b6e6c8ad5c02d87b0d5c643324a490c00b02cb5151c3423567416b6418a deleted file mode 100644 index 5507912..0000000 Binary files a/apicache/51852b6e6c8ad5c02d87b0d5c643324a490c00b02cb5151c3423567416b6418a and /dev/null differ diff --git a/apicache/6c747ab03ef6057844368f3b049a1404d70481ded4bac6d14a1eb12f2c9177dd b/apicache/6c747ab03ef6057844368f3b049a1404d70481ded4bac6d14a1eb12f2c9177dd deleted file mode 100644 index 015b047..0000000 Binary files a/apicache/6c747ab03ef6057844368f3b049a1404d70481ded4bac6d14a1eb12f2c9177dd and /dev/null differ diff --git a/apicache/7f19f9d02421f697179c40f256cc2bc26874d465db53e709c6554e645592b0a1 b/apicache/7f19f9d02421f697179c40f256cc2bc26874d465db53e709c6554e645592b0a1 deleted file mode 100644 index 0cdb3b6..0000000 Binary files a/apicache/7f19f9d02421f697179c40f256cc2bc26874d465db53e709c6554e645592b0a1 and /dev/null differ diff --git a/apicache/9f4738a9683ad8317eccf68a9f0ac99733ad245f8591f27775196ff85281262f b/apicache/9f4738a9683ad8317eccf68a9f0ac99733ad245f8591f27775196ff85281262f deleted file mode 100644 index 9f79f13..0000000 Binary files a/apicache/9f4738a9683ad8317eccf68a9f0ac99733ad245f8591f27775196ff85281262f and /dev/null differ diff --git a/apicache/c8cfa01cfa3d31d40333531ef714e97d7a0ded341ad1ee57a084cb1ede53f371 b/apicache/c8cfa01cfa3d31d40333531ef714e97d7a0ded341ad1ee57a084cb1ede53f371 deleted file mode 100644 index 7f360d3..0000000 Binary files a/apicache/c8cfa01cfa3d31d40333531ef714e97d7a0ded341ad1ee57a084cb1ede53f371 and /dev/null differ diff --git a/apicache/e87d1f5aa0e9fc68dbb97d738cde91273e2ed3281fc390f8420535e7b0548f79 b/apicache/e87d1f5aa0e9fc68dbb97d738cde91273e2ed3281fc390f8420535e7b0548f79 deleted file mode 100644 index 6ad1442..0000000 Binary files a/apicache/e87d1f5aa0e9fc68dbb97d738cde91273e2ed3281fc390f8420535e7b0548f79 and /dev/null differ diff --git a/app copy 2.js b/app copy 2.js new file mode 100644 index 0000000..f893701 --- /dev/null +++ b/app copy 2.js @@ -0,0 +1,139 @@ +// Backup from 10/25 before adding jobs table +const jobslist = document.getElementById('jobslist'); +const searchBar = document.getElementById('searchBar'); +const mainCategory = document.getElementById('mainCategory'); +let jobData = []; + +// Event listener for the search bar +searchBar.addEventListener('keyup', (e) => { + const searchString = e.target.value.toLowerCase(); + const searchWords = searchString.split(' ').filter(word => word.length > 0); + + if (searchWords.length === 0) { + displayJobs(jobData.flatMap(category => { + return Object.entries(category.jobs).map(([jobTitle, job]) => ({ + main_category: category.main_category, + jobTitle, + job + })); + })); + return; + } + + const filteredJobs = jobData.flatMap(category => { + return Object.entries(category.jobs) + .filter(([jobTitle, job]) => { + const jobTitleMatch = searchWords.some(word => jobTitle.toLowerCase().includes(word)); + const linksMatch = job.links.some(link => + searchWords.some(word => + link.url.toLowerCase().includes(word) || + link.category.toLowerCase().includes(word) + ) + ); + return jobTitleMatch || linksMatch; + }) + .map(([jobTitle, job]) => ({ + main_category: category.main_category, + jobTitle, + job + })); + }); + + displayJobs(filteredJobs); +}); + +// Function to load jobs data +const loadJobs = async () => { + try { + const res = await fetch('jobs.json'); // Adjust the path if necessary + jobData = await res.json(); // Load all categories and jobs + displayJobs(jobData.flatMap(category => { + return Object.entries(category.jobs).map(([jobTitle, job]) => ({ + main_category: category.main_category, + jobTitle, + job + })); + })); + } catch (err) { + console.error(err); + } +}; + +const displayJobs = (jobs) => { + let lastCategory = ''; + const htmlString = jobs.map(({ main_category, jobTitle, job }) => { + const isNewCategory = main_category !== lastCategory; + lastCategory = main_category; + + const linksHtml = job.links.map(link => { + return ` + + `; + }).join(''); + + const videosHtml = job.videos.map(video => { + const videoId = extractVideoId(video.url); + const thumbnailUrl = `https://img.youtube.com/vi/${videoId}/hqdefault.jpg`; + + return ` +
+ Video thumbnail +
+ `; + }).join(''); + + // Display degree required in a separate text box next to the job title + return ` + ${isNewCategory ? `

${main_category}

` : ''} +
+

${jobTitle}

+ Degree Required: ${job.degree_required} + +
${videosHtml}
+
+ `; + }).join(''); + + jobslist.innerHTML = htmlString; + updateMainCategory(jobs); + + // Add event listeners for lazy loading videos + document.querySelectorAll('.video-thumbnail').forEach(thumbnail => { + thumbnail.addEventListener('click', function() { + const videoWrapper = this.closest('.video-wrapper'); + const videoId = videoWrapper.dataset.videoId; + + const iframe = document.createElement('iframe'); + iframe.src = `https://www.youtube.com/embed/${videoId}?autoplay=1`; + iframe.setAttribute('frameborder', '0'); + iframe.setAttribute('allow', 'accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture'); + iframe.setAttribute('allowfullscreen', true); + iframe.setAttribute('loading', 'lazy'); + + videoWrapper.innerHTML = ''; // Clear the thumbnail + videoWrapper.appendChild(iframe); + }); + }); +}; + +// Function to update the main category display +const updateMainCategory = (jobs) => { + const firstJob = jobs[0]; + if (firstJob) { + mainCategory.textContent = firstJob.main_category; + } else { + mainCategory.textContent = ''; // Clear the main category if no jobs + } +}; + +// Function to extract the YouTube video ID from the URL +const extractVideoId = (url) => { + const urlParams = new URLSearchParams(new URL(url).search); + return urlParams.get('v') || url.split('/').pop(); +}; + +// Load jobs data when the page is ready +document.addEventListener('DOMContentLoaded', loadJobs); diff --git a/app copy 3.js b/app copy 3.js new file mode 100644 index 0000000..7019a21 --- /dev/null +++ b/app copy 3.js @@ -0,0 +1,144 @@ +// 10/25/24 backup +const jobslist = document.getElementById('jobslist'); +const searchBar = document.getElementById('searchBar'); +const mainCategory = document.getElementById('mainCategory'); +let jobData = []; + +// Event listener for the search bar +searchBar.addEventListener('keyup', (e) => { + const searchString = e.target.value.toLowerCase(); + const searchWords = searchString.split(' ').filter(word => word.length > 0); + + if (searchWords.length === 0) { + displayJobs(jobData.flatMap(category => { + return Object.entries(category.jobs).map(([jobTitle, job]) => ({ + main_category: category.main_category, + jobTitle, + job + })); + })); + return; + } + + const filteredJobs = jobData.flatMap(category => { + return Object.entries(category.jobs) + .filter(([jobTitle, job]) => { + const jobTitleMatch = searchWords.some(word => jobTitle.toLowerCase().includes(word)); + const linksMatch = job.links.some(link => + searchWords.some(word => + link.url.toLowerCase().includes(word) || + link.category.toLowerCase().includes(word) + ) + ); + return jobTitleMatch || linksMatch; + }) + .map(([jobTitle, job]) => ({ + main_category: category.main_category, + jobTitle, + job + })); + }); + + displayJobs(filteredJobs); +}); + +// Function to load jobs data +const loadJobs = async () => { + try { + const res = await fetch('jobs.json'); // Adjust the path if necessary + jobData = await res.json(); // Load all categories and jobs + displayJobs(jobData.flatMap(category => { + return Object.entries(category.jobs).map(([jobTitle, job]) => ({ + main_category: category.main_category, + jobTitle, + job + })); + })); + } catch (err) { + console.error(err); + } +}; + +const displayJobs = (jobs) => { + let lastCategory = ''; + const htmlString = jobs.map(({ main_category, jobTitle, job }) => { + const isNewCategory = main_category !== lastCategory; + lastCategory = main_category; + + const linksHtml = job.links.map(link => { + return ` + + `; + }).join(''); + + const videosHtml = job.videos.map(video => { + const videoId = extractVideoId(video.url); + const thumbnailUrl = `https://img.youtube.com/vi/${videoId}/hqdefault.jpg`; + + return ` +
+ Video thumbnail +
+ `; + }).join(''); + + // Check if a jobs table exists for the job + const jobsTableHtml = job.jobs_table ? job.jobs_table : ''; + + // Display degree required in a separate text box next to the job title + return ` + ${isNewCategory ? `

${main_category}

` : ''} +
+

${jobTitle}

+ Degree Required: ${job.degree_required} + +
${videosHtml}
+ +
${jobsTableHtml}
+
+ `; + }).join(''); + + jobslist.innerHTML = htmlString; + updateMainCategory(jobs); + + // Add event listeners for lazy loading videos + document.querySelectorAll('.video-thumbnail').forEach(thumbnail => { + thumbnail.addEventListener('click', function() { + const videoWrapper = this.closest('.video-wrapper'); + const videoId = videoWrapper.dataset.videoId; + + const iframe = document.createElement('iframe'); + iframe.src = `https://www.youtube.com/embed/${videoId}?autoplay=1`; + iframe.setAttribute('frameborder', '0'); + iframe.setAttribute('allow', 'accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture'); + iframe.setAttribute('allowfullscreen', true); + iframe.setAttribute('loading', 'lazy'); + + videoWrapper.innerHTML = ''; // Clear the thumbnail + videoWrapper.appendChild(iframe); + }); + }); +}; + +// Function to update the main category display +const updateMainCategory = (jobs) => { + const firstJob = jobs[0]; + if (firstJob) { + mainCategory.textContent = firstJob.main_category; + } else { + mainCategory.textContent = ''; // Clear the main category if no jobs + } +}; + +// Function to extract the YouTube video ID from the URL +const extractVideoId = (url) => { + const urlParams = new URLSearchParams(new URL(url).search); + return urlParams.get('v') || url.split('/').pop(); +}; + +// Load jobs data when the page is ready +document.addEventListener('DOMContentLoaded', loadJobs); diff --git a/app.css b/app.css index 85a4018..505d6f5 100644 --- a/app.css +++ b/app.css @@ -326,3 +326,99 @@ h3 { .degree-box.green { background-color: #28a745; /* Green for Graduate degree */ } + + +/* Basic styling for the body */ +body { + font-family: Arial, sans-serif; + background-color: #f9f9f9; + margin: 20px; +} + +/* Style for the table */ +table { + width: 50%; /* Set table width to 50% for left half of the screen */ + /* border-collapse: collapse; */ + margin: 20px 0; + box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1); +} + +/* Style for table headers */ +th { + background-color: #4CAF50; + color: white; + padding: 10px; + text-align: left; /* Align text in headers to the left */ +} + +/* Style for table cells */ +td { + padding: 8px; /* Reduced padding for less whitespace */ + /* border: 1px solid #ddd; */ + text-align: left; /* Left-align text in cells */ + white-space: nowrap; /* Prevent text wrapping */ +} + +/* Hover effect for table rows */ +tr:hover { + background-color: #ff000015; +} + +/* Responsive table */ +@media (max-width: 600px) { + table { + width: 100%; + display: block; + overflow-x: auto; + } +} + + +/* Basic styling for the body */ +body { + font-family: Arial, sans-serif; + background-color: #f9f9f9; + margin: 20px; +} + +/* Style for the table */ +table { + width: 60%; /* Set table width to 60% to make it a bit wider */ + border-collapse: collapse; + margin: 20px auto; /* Center-align the table */ + box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1); +} + +/* Style for table headers */ +th { + background-color: #4CAF50; + color: white; + padding: 10px; + text-align: left; /* Align text in headers to the left */ +} + +/* Style for table cells */ +td { + padding: 8px; /* Reduced padding for less whitespace */ + text-align: left; /* Left-align text in cells */ + white-space: nowrap; /* Prevent text wrapping */ +} + +/* Wider first column */ +td:first-child, th:first-child { + width: 40%; /* Adjust this width as necessary for the first column */ +} + +/* Hover effect for table rows */ +tr:hover { + background-color: #ff000015; +} + +/* Responsive table */ +@media (max-width: 600px) { + table { + width: 100%; + display: block; + overflow-x: auto; + } +} diff --git a/app.js b/app.js index af610fe..ce6a4cc 100644 --- a/app.js +++ b/app.js @@ -2,29 +2,76 @@ const jobslist = document.getElementById('jobslist'); const searchBar = document.getElementById('searchBar'); const mainCategory = document.getElementById('mainCategory'); let jobData = []; +let displayedJobs = []; +let filteredJobs = []; +const chunkSize = 20; // Number of jobs to load at a time +let currentIndex = 0; // Event listener for the search bar searchBar.addEventListener('keyup', (e) => { const searchString = e.target.value.toLowerCase(); const searchWords = searchString.split(' ').filter(word => word.length > 0); + + // Debouncing for search to reduce load during typing + clearTimeout(window.searchTimeout); + window.searchTimeout = setTimeout(() => { + if (searchWords.length === 0) { + resetJobs(); // Reset to original job data + return; + } + filteredJobs = filterJobs(searchWords); + displayedJobs = []; // Clear displayed jobs for new search results + currentIndex = 0; // Reset index for filtered results + loadMoreJobs(); // Load the first chunk of filtered jobs + }, 300); // Adjust timeout as needed +}); - if (searchWords.length === 0) { - displayJobs(jobData.flatMap(category => { - return Object.entries(category.jobs).map(([jobTitle, job]) => ({ - main_category: category.main_category, - jobTitle, - job - })); - })); - return; +// Function to load jobs data +const loadJobs = async () => { + try { + const res = await fetch('jobs.json'); // Adjust the path if necessary + jobData = await res.json(); // Load all categories and jobs + loadMoreJobs(); // Load the initial chunk of jobs + } catch (err) { + console.error(err); } +}; + +// Function to load more jobs +const loadMoreJobs = () => { + const jobsToLoad = filteredJobs.length > 0 ? filteredJobs : jobData.flatMap(category => { + return Object.entries(category.jobs).map(([jobTitle, job]) => ({ + main_category: category.main_category, + jobTitle, + job + })); + }); + + const nextJobs = jobsToLoad.slice(currentIndex, currentIndex + chunkSize); + + if (nextJobs.length === 0) return; // No more jobs to load + + displayedJobs = [...displayedJobs, ...nextJobs]; + currentIndex += chunkSize; - const filteredJobs = jobData.flatMap(category => { + displayJobs(displayedJobs); +}; + +// Function to reset jobs when search input is cleared +const resetJobs = () => { + displayedJobs = []; + currentIndex = 0; + loadMoreJobs(); // Load the first chunk again +}; + +// Function to filter jobs based on search input +const filterJobs = (searchWords) => { + return jobData.flatMap(category => { return Object.entries(category.jobs) .filter(([jobTitle, job]) => { const jobTitleMatch = searchWords.some(word => jobTitle.toLowerCase().includes(word)); const linksMatch = job.links.some(link => - searchWords.some(word => + searchWords.some(word => link.url.toLowerCase().includes(word) || link.category.toLowerCase().includes(word) ) @@ -37,25 +84,6 @@ searchBar.addEventListener('keyup', (e) => { job })); }); - - displayJobs(filteredJobs); -}); - -// Function to load jobs data -const loadJobs = async () => { - try { - const res = await fetch('jobs.json'); // Adjust the path if necessary - jobData = await res.json(); // Load all categories and jobs - displayJobs(jobData.flatMap(category => { - return Object.entries(category.jobs).map(([jobTitle, job]) => ({ - main_category: category.main_category, - jobTitle, - job - })); - })); - } catch (err) { - console.error(err); - } }; const displayJobs = (jobs) => { @@ -64,42 +92,48 @@ const displayJobs = (jobs) => { const isNewCategory = main_category !== lastCategory; lastCategory = main_category; - const linksHtml = job.links.map(link => { - return ` - - `; - }).join(''); - - const videosHtml = job.videos.map(video => { - const videoId = extractVideoId(video.url); - const thumbnailUrl = `https://img.youtube.com/vi/${videoId}/hqdefault.jpg`; - - return ` -
- Video thumbnail -
- `; - }).join(''); - - // Display degree required in a separate text box next to the job title return ` ${isNewCategory ? `

${main_category}

` : ''}

${jobTitle}

- Degree Required: ${job.degree_required} - -
${videosHtml}
+ Degree Required: ${job.degree_required} + +
${generateVideosHtml(job.videos)}
+
${job.jobs_table || ''}
`; }).join(''); jobslist.innerHTML = htmlString; updateMainCategory(jobs); + setupVideoThumbnails(); +}; + +// Function to generate links HTML +const generateLinksHtml = (links) => { + return links.map(link => ` + + `).join(''); +}; - // Add event listeners for lazy loading videos +// Function to generate videos HTML +const generateVideosHtml = (videos) => { + return videos.map(video => { + const videoId = extractVideoId(video.url); + const thumbnailUrl = `https://img.youtube.com/vi/${videoId}/hqdefault.jpg`; + return ` +
+ Video thumbnail +
+ `; + }).join(''); +}; + +// Function to setup lazy loading of videos +const setupVideoThumbnails = () => { document.querySelectorAll('.video-thumbnail').forEach(thumbnail => { thumbnail.addEventListener('click', function() { const videoWrapper = this.closest('.video-wrapper'); @@ -134,5 +168,12 @@ const extractVideoId = (url) => { return urlParams.get('v') || url.split('/').pop(); }; +// Infinite scrolling: load more jobs when scrolling to the bottom +window.addEventListener('scroll', () => { + if (window.innerHeight + window.scrollY >= document.body.offsetHeight) { + loadMoreJobs(); + } +}); + // Load jobs data when the page is ready document.addEventListener('DOMContentLoaded', loadJobs); diff --git a/job tables.html b/job tables.html index 8da62ba..e4559f2 100644 --- a/job tables.html +++ b/job tables.html @@ -1,11 +1,4 @@ - - - - - Quick Facts Table - - - +
@@ -36,7 +29,4 @@ -
Quick Facts: Real Estate Brokers and Sales Agents
10,100
Source: www.bls.gov
- - - \ No newline at end of file + \ No newline at end of file diff --git a/jobs.json b/jobs.json index be7b8e3..43cb741 100644 --- a/jobs.json +++ b/jobs.json @@ -23,7 +23,8 @@ "url": "https://www.youtube.com/watch?v=ovjmCc1R6EU" } ], - "degree_required": "Bachelor's" + "degree_required": "Bachelor's", + "jobs_table": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Quick Facts Table
\t\t\t\t\t\t\t\t\t\t2023 Median Pay\t\t\t\t\t\t\t\t\t\t \n $61,270 per year\n
$29.46 per hour\n
Typical Entry-Level Education High school diploma or equivalent
Number of Jobs, 2023 111,600
Job Outlook, 2023-33-7% (Decline)
Employment Change, 2023-33 -7,400
Source: www.bls.gov
" }, "Assistant Bank Manager": { "links": [ @@ -4450,7 +4451,8 @@ } ], "videos": [], - "degree_required": "Bachelor's" + "degree_required": "Bachelor's", + "jobs_table": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Quick Facts Table
\t\t\t\t\t\t\t\t\t\t2023 Median Pay\t\t\t\t\t\t\t\t\t\t \n $71,540 per year\n
$34.39 per hour\n
Typical Entry-Level Education Bachelor's degree
Number of Jobs, 2023 8,200
Job Outlook, 2023-336% (Faster than average)
Employment Change, 2023-33 500
Source: www.bls.gov
" }, "Conservation Officer": { "links": [ @@ -4713,7 +4715,8 @@ "url": "https://www.youtube.com/watch?v=vh4tRwjfTsA" } ], - "degree_required": "Bachelor's" + "degree_required": "Bachelor's", + "jobs_table": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Quick Facts Table
\t\t\t\t\t\t\t\t\t\t2023 Median Pay\t\t\t\t\t\t\t\t\t\t \n $60,970 per year\n
$29.31 per hour\n
Typical Entry-Level Education Associate's degree
Number of Jobs, 2023 366,200
Job Outlook, 2023-331% (Slower than average)
Employment Change, 2023-33 4,300
Source: www.bls.gov
" }, "Police Officer": { "links": [ @@ -9119,7 +9122,8 @@ "url": "http://www.youtube.com/watch?v=YrYs_9lEGyY" } ], - "degree_required": "Bachelor's" + "degree_required": "Bachelor's", + "jobs_table": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Quick Facts Table
\t\t\t\t\t\t\t\t\t\t2023 Median Pay\t\t\t\t\t\t\t\t\t\t \n $99,700 per year\n
$47.94 per hour\n
Typical Entry-Level Education Bachelor's degree
Number of Jobs, 2023 139,400
Job Outlook, 2023-33-10% (Decline)
Employment Change, 2023-33 -13,400
Source: www.bls.gov
" }, "Data Base Administrator": { "links": [ @@ -9353,7 +9357,8 @@ "url": "https://www.youtube.com/watch?v=trvCfinprNw" } ], - "degree_required": "Bachelor's" + "degree_required": "Bachelor's", + "jobs_table": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Quick Facts Table
\t\t\t\t\t\t\t\t\t\t2023 Median Pay\t\t\t\t\t\t\t\t\t\t \n $65,450 per year\n
$31.47 per hour\n
Typical Entry-Level Education See How to Become One
Number of Jobs, 2023 52,100
Job Outlook, 2023-3321% (Much faster than average)
Employment Change, 2023-33 10,800
Source: www.bls.gov
" }, "Physical Therapist Assistant": { "links": [ @@ -9952,7 +9957,8 @@ } ], "videos": [], - "degree_required": "Bachelor's" + "degree_required": "Bachelor's", + "jobs_table": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Quick Facts Table
\t\t\t\t\t\t\t\t\t\t2023 Median Pay\t\t\t\t\t\t\t\t\t\t \n \n $20.50 per hour\n
Typical Entry-Level Education Some college, no degree
Number of Jobs, 2023 76,000
Job Outlook, 2023-335% (As fast as average)
Employment Change, 2023-33 3,500
Source: www.bls.gov
" }, "Airline Pilot": { "links": [ diff --git a/make_website_abridged.ipynb b/make_website_abridged.ipynb index 19636bc..ca54820 100644 --- a/make_website_abridged.ipynb +++ b/make_website_abridged.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -161,11 +161,11 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "## Generating a site with jobs tables in json file" + ] } ], "metadata": { diff --git a/web_retrieval.ipynb b/web_retrieval.ipynb index 7dc0e68..058eaf6 100644 --- a/web_retrieval.ipynb +++ b/web_retrieval.ipynb @@ -2628,14 +2628,38 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Quick Facts table with source link has been successfully saved to 'job tables.html'\n" + "Error: Unable to find the Quick Facts table. Exception: Message: no such window: target window already closed\n", + "from unknown error: web view not found\n", + " (Session info: MicrosoftEdge=130.0.2849.46)\n", + "Stacktrace:\n", + "\tGetHandleVerifier [0x00007FF69682DC75+12853]\n", + "\tMicrosoft::Applications::Events::EventProperty::empty [0x00007FF696AD83C4+2250276]\n", + "\tMicrosoft::Applications::Events::EventProperty::empty [0x00007FF696A176A6+1460486]\n", + "\t(No symbol) [0x00007FF6965C84C7]\n", + "\t(No symbol) [0x00007FF69664CBED]\n", + "\t(No symbol) [0x00007FF696660D7A]\n", + "\t(No symbol) [0x00007FF696648163]\n", + "\t(No symbol) [0x00007FF69661FB54]\n", + "\t(No symbol) [0x00007FF69661F000]\n", + "\t(No symbol) [0x00007FF69661F741]\n", + "\tMicrosoft::Applications::Events::EventProperty::empty [0x00007FF6969ACDB4+1024020]\n", + "\tMicrosoft::Applications::Events::EventProperty::~EventProperty [0x00007FF696749A04+57108]\n", + "\tMicrosoft::Applications::Events::EventProperty::empty [0x00007FF6969ABDB3+1019923]\n", + "\tMicrosoft::Applications::Events::EventProperty::empty [0x00007FF6969ABA19+1019001]\n", + "\tMicrosoft::Applications::Events::ILogConfiguration::operator* [0x00007FF6967CE031+394913]\n", + "\tMicrosoft::Applications::Events::ILogConfiguration::operator* [0x00007FF6967CA954+380868]\n", + "\tMicrosoft::Applications::Events::ILogConfiguration::operator* [0x00007FF6967CAA99+381193]\n", + "\tMicrosoft::Applications::Events::ILogConfiguration::operator* [0x00007FF6967C2296+346374]\n", + "\tBaseThreadInitThunk [0x00007FF8A22F1FD7+23]\n", + "\tRtlUserThreadStart [0x00007FF8A2C5D7D0+32]\n", + "\n" ] } ], @@ -2735,6 +2759,1028 @@ "# Close the WebDriver after you're done\n", "# driver.quit() # Uncomment this line to close the browser after use\n" ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[2], line 219\u001b[0m\n\u001b[0;32m 216\u001b[0m output_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mjobs.json\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m 217\u001b[0m url \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhttps://www.bls.gov/ooh/sales/real-estate-brokers-and-sales-agents.htm#tab-1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m--> 219\u001b[0m \u001b[43mmain\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstart_line\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mend_line\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_file\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[1;32mIn[2], line 200\u001b[0m, in \u001b[0;36mmain\u001b[1;34m(file_path, start_line, end_line, output_file, url)\u001b[0m\n\u001b[0;32m 198\u001b[0m lines \u001b[38;5;241m=\u001b[39m read_docx(file_path, start_line, end_line)\n\u001b[0;32m 199\u001b[0m job_data \u001b[38;5;241m=\u001b[39m parse_lines(lines) \u001b[38;5;66;03m# Parse the job data from the Word document\u001b[39;00m\n\u001b[1;32m--> 200\u001b[0m html_table \u001b[38;5;241m=\u001b[39m \u001b[43mscrape_quick_facts\u001b[49m\u001b[43m(\u001b[49m\u001b[43murl\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Scrape the HTML table\u001b[39;00m\n\u001b[0;32m 202\u001b[0m \u001b[38;5;66;03m# Combine the job data with the HTML table\u001b[39;00m\n\u001b[0;32m 203\u001b[0m combined_data \u001b[38;5;241m=\u001b[39m {\n\u001b[0;32m 204\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mjobs\u001b[39m\u001b[38;5;124m\"\u001b[39m: job_data,\n\u001b[0;32m 205\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mjobs_table\u001b[39m\u001b[38;5;124m\"\u001b[39m: html_table\n\u001b[0;32m 206\u001b[0m }\n", + "Cell \u001b[1;32mIn[2], line 134\u001b[0m, in \u001b[0;36mscrape_quick_facts\u001b[1;34m(url)\u001b[0m\n\u001b[0;32m 131\u001b[0m driver\u001b[38;5;241m.\u001b[39mget(url)\n\u001b[0;32m 133\u001b[0m \u001b[38;5;66;03m# Allow time for the page to load\u001b[39;00m\n\u001b[1;32m--> 134\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m5\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Increase if necessary to allow the page to fully load\u001b[39;00m\n\u001b[0;32m 136\u001b[0m \u001b[38;5;66;03m# Try to find the Quick Facts table\u001b[39;00m\n\u001b[0;32m 137\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 138\u001b[0m \u001b[38;5;66;03m# Locate the Quick Facts table by its ID\u001b[39;00m\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "import re\n", + "import json\n", + "from docx import Document\n", + "from docx.oxml.ns import qn\n", + "from selenium import webdriver\n", + "from selenium.webdriver.edge.service import Service\n", + "from selenium.webdriver.common.by import By\n", + "from webdriver_manager.microsoft import EdgeChromiumDriverManager\n", + "from bs4 import BeautifulSoup\n", + "import time\n", + "\n", + "# Function to check if run is highlighted and get the highlight color\n", + "def get_highlight_color(run):\n", + " highlight_elements = run.element.xpath('.//w:highlight')\n", + " if highlight_elements:\n", + " highlight_element = highlight_elements[0]\n", + " color = highlight_element.get(qn('w:val'))\n", + " return color\n", + " return None\n", + "\n", + "def read_docx(file_path, start_line, end_line):\n", + " doc = Document(file_path)\n", + " lines = []\n", + " line_count = 0\n", + " for paragraph in doc.paragraphs:\n", + " if start_line <= line_count < end_line:\n", + " para_text = paragraph.text.strip()\n", + " highlight_color = None\n", + " for run in paragraph.runs:\n", + " color = get_highlight_color(run)\n", + " if color:\n", + " highlight_color = color # Capture the first highlighted color in the paragraph\n", + " break\n", + " lines.append((para_text, highlight_color)) # Store line and highlight color as a tuple\n", + " line_count += 1\n", + " return lines\n", + "\n", + "def parse_lines(lines):\n", + " url_pattern = re.compile(r'https?://\\S+')\n", + " youtube_pattern = re.compile(r'(https?://(?:www\\.)?youtube\\.com/watch\\?v=[\\w-]+|https?://(?:www\\.)?youtu\\.be/[\\w-]+)')\n", + " data = []\n", + " current_main_category = None\n", + " current_job = None\n", + " jobs = {}\n", + "\n", + " for line, highlight_color in lines:\n", + " youtube_match = youtube_pattern.search(line)\n", + " if youtube_match:\n", + " # Extract the YouTube URL\n", + " youtube_url = youtube_match.group()\n", + " video_id = re.search(r'(?:v=|youtu\\.be/)([\\w-]+)', youtube_url).group(1)\n", + " \n", + " # Store the video ID instead of the iframe HTML\n", + " if current_job and current_job in jobs:\n", + " jobs[current_job]['videos'].append({\n", + " 'video_id': video_id, # Store only the video ID\n", + " 'url': youtube_url # Optionally store the original URL\n", + " })\n", + " elif url_pattern.search(line):\n", + " # Extract the URL\n", + " url_match = url_pattern.search(line)\n", + " url = url_match.group()\n", + " \n", + " # Extract the category (everything before the URL)\n", + " category = line[:url_match.start()].strip()\n", + "\n", + " if current_job and current_job in jobs:\n", + " jobs[current_job]['links'].append({\n", + " 'url': url,\n", + " 'category': category\n", + " })\n", + " elif line.lower().startswith(\"undefined\"):\n", + " # Skip lines starting with \"undefined\"\n", + " continue\n", + " else:\n", + " if current_main_category is None:\n", + " # Set the main category\n", + " current_main_category = line\n", + " elif current_job is None:\n", + " # Set the job title (sub_category)\n", + " current_job = line\n", + " jobs[current_job] = {\n", + " 'links': [],\n", + " 'videos': [],\n", + " 'degree_required': \"Bachelor's\" if highlight_color == 'cyan' else \"Graduate's\" if highlight_color == 'green' else ''\n", + " }\n", + " else:\n", + " # Handle a new main category if a new line appears\n", + " if line.strip() == \"\":\n", + " if current_main_category:\n", + " # Save the current main category and its jobs\n", + " data.append({\n", + " 'main_category': current_main_category,\n", + " 'jobs': jobs\n", + " })\n", + " # Reset for the next main category\n", + " current_main_category = None\n", + " jobs = {}\n", + " current_job = None\n", + " else:\n", + " # If it's neither a URL nor an empty line, it might be a new job\n", + " if current_job:\n", + " # Make sure to add the current job to jobs before changing\n", + " current_job = line\n", + " jobs[current_job] = {\n", + " 'links': [],\n", + " 'videos': [],\n", + " 'degree_required': \"Bachelor's\" if highlight_color == 'cyan' else \"Graduate's\" if highlight_color == 'green' else ''\n", + " }\n", + "\n", + " # Handle the last main category and jobs if they exist\n", + " if current_main_category and jobs:\n", + " data.append({\n", + " 'main_category': current_main_category,\n", + " 'jobs': jobs\n", + " })\n", + "\n", + " return data\n", + "\n", + "# Function to scrape the Quick Facts table and generate the HTML\n", + "def scrape_quick_facts(url):\n", + " # Set up the Edge WebDriver\n", + " options = webdriver.EdgeOptions()\n", + " # Do not use the headless option to see the browser window\n", + " # options.add_argument('--headless') # Do not use this line\n", + "\n", + " # Initialize the Edge WebDriver\n", + " driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)\n", + "\n", + " # Open the URL\n", + " driver.get(url)\n", + "\n", + " # Allow time for the page to load\n", + " time.sleep(5) # Increase if necessary to allow the page to fully load\n", + "\n", + " # Try to find the Quick Facts table\n", + " try:\n", + " # Locate the Quick Facts table by its ID\n", + " quickfacts_table = driver.find_element(By.ID, 'quickfacts')\n", + " \n", + " # Get the outer HTML of the table\n", + " quickfacts_html = quickfacts_table.get_attribute('outerHTML')\n", + " \n", + " # Parse the HTML using BeautifulSoup\n", + " soup = BeautifulSoup(quickfacts_html, 'html.parser')\n", + " \n", + " # Remove all links\n", + " for link in soup.find_all('a'):\n", + " link.unwrap() # Remove the link but keep the text\n", + " \n", + " # Remove specific rows\n", + " rows_to_remove = [\"Work Experience in a Related Occupation\", \"On-the-job Training\"]\n", + " for row in soup.find_all('tr'):\n", + " header = row.find('th')\n", + " if header and header.get_text(strip=True) in rows_to_remove:\n", + " row.decompose() # Remove the row entirely\n", + " \n", + " # Modify the first row to have a single cell that spans all columns\n", + " first_row = soup.find('tr')\n", + " first_cell = soup.new_tag('th')\n", + " first_cell['colspan'] = '2' # Set to span all columns\n", + " first_cell.string = 'Quick Facts: Real Estate Brokers and Sales Agents' # Change this to your desired title\n", + " first_row.clear() # Clear existing cells in the first row\n", + " first_row.append(first_cell) # Add the new cell\n", + " \n", + " # Add the source link at the bottom of the table\n", + " source_row = soup.new_tag('tr')\n", + " source_header = soup.new_tag('th')\n", + " source_header['colspan'] = '2' # Span across two columns\n", + " source_header.string = 'Source: '\n", + " \n", + " # Create the source link\n", + " source_link = soup.new_tag('a', href=url)\n", + " source_link.string = 'www.bls.gov'\n", + " source_header.append(source_link)\n", + " \n", + " source_row.append(source_header)\n", + " soup.find('tbody').append(source_row) # Add the source row to the table\n", + "\n", + " # Get the updated HTML without links and excluded rows\n", + " updated_quickfacts_html = str(soup)\n", + "\n", + " # Save the complete HTML to a file\n", + " with open(\"job tables.html\", \"w\", encoding=\"utf-8\") as file:\n", + " file.write(updated_quickfacts_html)\n", + "\n", + " print(\"Quick Facts table with source link has been successfully saved to 'job tables.html'\")\n", + "\n", + " return updated_quickfacts_html # Return the HTML for further processing\n", + "\n", + " except Exception as e:\n", + " print(\"Error: Unable to find the Quick Facts table. Exception:\", e)\n", + " finally:\n", + " # Close the WebDriver after you're done\n", + " driver.quit() # Close the browser\n", + "\n", + "def main(file_path, start_line, end_line, output_file, url):\n", + " lines = read_docx(file_path, start_line, end_line)\n", + " job_data = parse_lines(lines) # Parse the job data from the Word document\n", + " html_table = scrape_quick_facts(url) # Scrape the HTML table\n", + "\n", + " # Combine the job data with the HTML table\n", + " combined_data = {\n", + " \"jobs\": job_data,\n", + " \"jobs_table\": html_table\n", + " }\n", + "\n", + " # Save the combined data to a JSON file\n", + " with open(output_file, 'w') as f:\n", + " json.dump(combined_data, f, indent=4)\n", + "\n", + "# Specify the file path and line range\n", + "file_path = 'careers.docx'\n", + "start_line = 171 # Starting line (inclusive)\n", + "end_line = start_line + 3000 # Adjust the ending line as needed\n", + "output_file = 'jobs.json'\n", + "url = \"https://www.bls.gov/ooh/sales/real-estate-brokers-and-sales-agents.htm#tab-1\"\n", + "\n", + "main(file_path, start_line, end_line, output_file, url)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[1], line 220\u001b[0m\n\u001b[0;32m 217\u001b[0m end_line \u001b[38;5;241m=\u001b[39m start_line \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m3000\u001b[39m \u001b[38;5;66;03m# Adjust the ending line as needed\u001b[39;00m\n\u001b[0;32m 218\u001b[0m output_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mjobs.json\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m--> 220\u001b[0m \u001b[43mmain\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstart_line\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mend_line\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_file\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[1;32mIn[1], line 205\u001b[0m, in \u001b[0;36mmain\u001b[1;34m(file_path, start_line, end_line, output_file)\u001b[0m\n\u001b[0;32m 202\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m bls_links:\n\u001b[0;32m 203\u001b[0m \u001b[38;5;66;03m# Scrape the Quick Facts table for this job\u001b[39;00m\n\u001b[0;32m 204\u001b[0m url \u001b[38;5;241m=\u001b[39m bls_links[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;66;03m# Use the first bls.gov link found\u001b[39;00m\n\u001b[1;32m--> 205\u001b[0m html_table \u001b[38;5;241m=\u001b[39m \u001b[43mscrape_quick_facts\u001b[49m\u001b[43m(\u001b[49m\u001b[43murl\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Scrape the HTML table\u001b[39;00m\n\u001b[0;32m 206\u001b[0m job_info[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mjobs_table\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m html_table \u001b[38;5;66;03m# Add the HTML table to the job info\u001b[39;00m\n\u001b[0;32m 208\u001b[0m combined_data\u001b[38;5;241m.\u001b[39mappend(job_category) \u001b[38;5;66;03m# Add the job category with all jobs to combined data\u001b[39;00m\n", + "Cell \u001b[1;32mIn[1], line 134\u001b[0m, in \u001b[0;36mscrape_quick_facts\u001b[1;34m(url)\u001b[0m\n\u001b[0;32m 131\u001b[0m driver\u001b[38;5;241m.\u001b[39mget(url)\n\u001b[0;32m 133\u001b[0m \u001b[38;5;66;03m# Allow time for the page to load\u001b[39;00m\n\u001b[1;32m--> 134\u001b[0m \u001b[43mtime\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m5\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Increase if necessary to allow the page to fully load\u001b[39;00m\n\u001b[0;32m 136\u001b[0m \u001b[38;5;66;03m# Try to find the Quick Facts table\u001b[39;00m\n\u001b[0;32m 137\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 138\u001b[0m \u001b[38;5;66;03m# Locate the Quick Facts table by its ID\u001b[39;00m\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "import re\n", + "import json\n", + "from docx import Document\n", + "from docx.oxml.ns import qn\n", + "from selenium import webdriver\n", + "from selenium.webdriver.edge.service import Service\n", + "from selenium.webdriver.common.by import By\n", + "from webdriver_manager.microsoft import EdgeChromiumDriverManager\n", + "from bs4 import BeautifulSoup\n", + "import time\n", + "\n", + "# Function to check if run is highlighted and get the highlight color\n", + "def get_highlight_color(run):\n", + " highlight_elements = run.element.xpath('.//w:highlight')\n", + " if highlight_elements:\n", + " highlight_element = highlight_elements[0]\n", + " color = highlight_element.get(qn('w:val'))\n", + " return color\n", + " return None\n", + "\n", + "def read_docx(file_path, start_line, end_line):\n", + " doc = Document(file_path)\n", + " lines = []\n", + " line_count = 0\n", + " for paragraph in doc.paragraphs:\n", + " if start_line <= line_count < end_line:\n", + " para_text = paragraph.text.strip()\n", + " highlight_color = None\n", + " for run in paragraph.runs:\n", + " color = get_highlight_color(run)\n", + " if color:\n", + " highlight_color = color # Capture the first highlighted color in the paragraph\n", + " break\n", + " lines.append((para_text, highlight_color)) # Store line and highlight color as a tuple\n", + " line_count += 1\n", + " return lines\n", + "\n", + "def parse_lines(lines):\n", + " url_pattern = re.compile(r'https?://\\S+')\n", + " youtube_pattern = re.compile(r'(https?://(?:www\\.)?youtube\\.com/watch\\?v=[\\w-]+|https?://(?:www\\.)?youtu\\.be/[\\w-]+)')\n", + " data = []\n", + " current_main_category = None\n", + " current_job = None\n", + " jobs = {}\n", + "\n", + " for line, highlight_color in lines:\n", + " youtube_match = youtube_pattern.search(line)\n", + " if youtube_match:\n", + " # Extract the YouTube URL\n", + " youtube_url = youtube_match.group()\n", + " video_id = re.search(r'(?:v=|youtu\\.be/)([\\w-]+)', youtube_url).group(1)\n", + " \n", + " # Store the video ID instead of the iframe HTML\n", + " if current_job and current_job in jobs:\n", + " jobs[current_job]['videos'].append({\n", + " 'video_id': video_id, # Store only the video ID\n", + " 'url': youtube_url # Optionally store the original URL\n", + " })\n", + " elif url_pattern.search(line):\n", + " # Extract the URL\n", + " url_match = url_pattern.search(line)\n", + " url = url_match.group()\n", + " \n", + " # Extract the category (everything before the URL)\n", + " category = line[:url_match.start()].strip()\n", + "\n", + " if current_job and current_job in jobs:\n", + " jobs[current_job]['links'].append({\n", + " 'url': url,\n", + " 'category': category\n", + " })\n", + " elif line.lower().startswith(\"undefined\"):\n", + " # Skip lines starting with \"undefined\"\n", + " continue\n", + " else:\n", + " if current_main_category is None:\n", + " # Set the main category\n", + " current_main_category = line\n", + " elif current_job is None:\n", + " # Set the job title (sub_category)\n", + " current_job = line\n", + " jobs[current_job] = {\n", + " 'links': [],\n", + " 'videos': [],\n", + " 'degree_required': \"Bachelor's\" if highlight_color == 'cyan' else \"Graduate's\" if highlight_color == 'green' else ''\n", + " }\n", + " else:\n", + " # Handle a new main category if a new line appears\n", + " if line.strip() == \"\":\n", + " if current_main_category:\n", + " # Save the current main category and its jobs\n", + " data.append({\n", + " 'main_category': current_main_category,\n", + " 'jobs': jobs\n", + " })\n", + " # Reset for the next main category\n", + " current_main_category = None\n", + " jobs = {}\n", + " current_job = None\n", + " else:\n", + " # If it's neither a URL nor an empty line, it might be a new job\n", + " if current_job:\n", + " # Make sure to add the current job to jobs before changing\n", + " current_job = line\n", + " jobs[current_job] = {\n", + " 'links': [],\n", + " 'videos': [],\n", + " 'degree_required': \"Bachelor's\" if highlight_color == 'cyan' else \"Graduate's\" if highlight_color == 'green' else ''\n", + " }\n", + "\n", + " # Handle the last main category and jobs if they exist\n", + " if current_main_category and jobs:\n", + " data.append({\n", + " 'main_category': current_main_category,\n", + " 'jobs': jobs\n", + " })\n", + "\n", + " return data\n", + "\n", + "# Function to scrape the Quick Facts table and generate the HTML\n", + "def scrape_quick_facts(url):\n", + " # Set up the Edge WebDriver\n", + " options = webdriver.EdgeOptions()\n", + " # Do not use the headless option to see the browser window\n", + " # options.add_argument('--headless') # Do not use this line\n", + "\n", + " # Initialize the Edge WebDriver\n", + " driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)\n", + "\n", + " # Open the URL\n", + " driver.get(url)\n", + "\n", + " # Allow time for the page to load\n", + " time.sleep(5) # Increase if necessary to allow the page to fully load\n", + "\n", + " # Try to find the Quick Facts table\n", + " try:\n", + " # Locate the Quick Facts table by its ID\n", + " quickfacts_table = driver.find_element(By.ID, 'quickfacts')\n", + " \n", + " # Get the outer HTML of the table\n", + " quickfacts_html = quickfacts_table.get_attribute('outerHTML')\n", + " \n", + " # Parse the HTML using BeautifulSoup\n", + " soup = BeautifulSoup(quickfacts_html, 'html.parser')\n", + " \n", + " # Remove all links\n", + " for link in soup.find_all('a'):\n", + " link.unwrap() # Remove the link but keep the text\n", + " \n", + " # Remove specific rows\n", + " rows_to_remove = [\"Work Experience in a Related Occupation\", \"On-the-job Training\"]\n", + " for row in soup.find_all('tr'):\n", + " header = row.find('th')\n", + " if header and header.get_text(strip=True) in rows_to_remove:\n", + " row.decompose() # Remove the row entirely\n", + " \n", + " # Modify the first row to have a single cell that spans all columns\n", + " first_row = soup.find('tr')\n", + " first_cell = soup.new_tag('th')\n", + " first_cell['colspan'] = '2' # Set to span all columns\n", + " first_cell.string = 'Quick Facts Table' # Change this to your desired title\n", + " first_row.clear() # Clear existing cells in the first row\n", + " first_row.append(first_cell) # Add the new cell\n", + " \n", + " # Add the source link at the bottom of the table\n", + " source_row = soup.new_tag('tr')\n", + " source_header = soup.new_tag('th')\n", + " source_header['colspan'] = '2' # Span across two columns\n", + " source_header.string = 'Source: '\n", + " \n", + " # Create the source link\n", + " source_link = soup.new_tag('a', href=url)\n", + " source_link.string = 'www.bls.gov'\n", + " source_header.append(source_link)\n", + " \n", + " source_row.append(source_header)\n", + " soup.find('tbody').append(source_row) # Add the source row to the table\n", + "\n", + " # Get the updated HTML without links and excluded rows\n", + " updated_quickfacts_html = str(soup)\n", + "\n", + " return updated_quickfacts_html # Return the HTML for further processing\n", + "\n", + " except Exception as e:\n", + " print(\"Error: Unable to find the Quick Facts table. Exception:\", e)\n", + " finally:\n", + " # Close the WebDriver after you're done\n", + " driver.quit() # Close the browser\n", + "\n", + "def main(file_path, start_line, end_line, output_file):\n", + " lines = read_docx(file_path, start_line, end_line)\n", + " job_data = parse_lines(lines) # Parse the job data from the Word document\n", + " combined_data = []\n", + "\n", + " # Loop through each job entry\n", + " for job_category in job_data:\n", + " jobs = job_category['jobs']\n", + " for job_title, job_info in jobs.items():\n", + " # Check if any of the job links is from bls.gov\n", + " bls_links = [link['url'] for link in job_info['links'] if 'bls.gov' in link['url']]\n", + " if bls_links:\n", + " # Scrape the Quick Facts table for this job\n", + " url = bls_links[0] # Use the first bls.gov link found\n", + " html_table = scrape_quick_facts(url) # Scrape the HTML table\n", + " job_info['jobs_table'] = html_table # Add the HTML table to the job info\n", + "\n", + " combined_data.append(job_category) # Add the job category with all jobs to combined data\n", + "\n", + " # Save the combined data to a JSON file\n", + " with open(output_file, 'w') as f:\n", + " json.dump(combined_data, f, indent=4)\n", + "\n", + "# Specify the file path and line range\n", + "file_path = 'careers.docx'\n", + "start_line = 171 # Starting line (inclusive)\n", + "end_line = start_line + 3000 # Adjust the ending line as needed\n", + "output_file = 'jobs.json'\n", + "\n", + "main(file_path, start_line, end_line, output_file)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing job: Advertising Sales Representative / Agent / Executive in category: Business, Advertising, and Finance\n" + ] + }, + { + "ename": "OSError", + "evalue": "[Errno 22] Invalid argument: 'jobs.json'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[1], line 216\u001b[0m\n\u001b[0;32m 213\u001b[0m end_line \u001b[38;5;241m=\u001b[39m start_line \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m3000\u001b[39m \u001b[38;5;66;03m# Adjust the ending line as needed\u001b[39;00m\n\u001b[0;32m 214\u001b[0m output_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mjobs.json\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m--> 216\u001b[0m \u001b[43mmain\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstart_line\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mend_line\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_file\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[1;32mIn[1], line 204\u001b[0m, in \u001b[0;36mmain\u001b[1;34m(file_path, start_line, end_line, output_file)\u001b[0m\n\u001b[0;32m 201\u001b[0m combined_data\u001b[38;5;241m.\u001b[39mappend(job_category) \u001b[38;5;66;03m# Add the job category with all jobs to combined data\u001b[39;00m\n\u001b[0;32m 203\u001b[0m \u001b[38;5;66;03m# Save the combined data to a JSON file incrementally\u001b[39;00m\n\u001b[1;32m--> 204\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput_file\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mw\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[0;32m 205\u001b[0m json\u001b[38;5;241m.\u001b[39mdump(combined_data, f, indent\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m4\u001b[39m)\n\u001b[0;32m 206\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mProgress saved: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(combined_data)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m job categories processed.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python313\\site-packages\\IPython\\core\\interactiveshell.py:324\u001b[0m, in \u001b[0;36m_modified_open\u001b[1;34m(file, *args, **kwargs)\u001b[0m\n\u001b[0;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[0;32m 318\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 319\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 320\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 321\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 322\u001b[0m )\n\u001b[1;32m--> 324\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[1;31mOSError\u001b[0m: [Errno 22] Invalid argument: 'jobs.json'" + ] + } + ], + "source": [ + "import re\n", + "import json\n", + "from docx import Document\n", + "from docx.oxml.ns import qn\n", + "from selenium import webdriver\n", + "from selenium.webdriver.edge.service import Service\n", + "from selenium.webdriver.common.by import By\n", + "from webdriver_manager.microsoft import EdgeChromiumDriverManager\n", + "from bs4 import BeautifulSoup\n", + "import time\n", + "\n", + "# Function to check if run is highlighted and get the highlight color\n", + "def get_highlight_color(run):\n", + " highlight_elements = run.element.xpath('.//w:highlight')\n", + " if highlight_elements:\n", + " highlight_element = highlight_elements[0]\n", + " color = highlight_element.get(qn('w:val'))\n", + " return color\n", + " return None\n", + "\n", + "def read_docx(file_path, start_line, end_line):\n", + " doc = Document(file_path)\n", + " lines = []\n", + " line_count = 0\n", + " for paragraph in doc.paragraphs:\n", + " if start_line <= line_count < end_line:\n", + " para_text = paragraph.text.strip()\n", + " highlight_color = None\n", + " for run in paragraph.runs:\n", + " color = get_highlight_color(run)\n", + " if color:\n", + " highlight_color = color # Capture the first highlighted color in the paragraph\n", + " break\n", + " lines.append((para_text, highlight_color)) # Store line and highlight color as a tuple\n", + " line_count += 1\n", + " return lines\n", + "\n", + "def parse_lines(lines):\n", + " url_pattern = re.compile(r'https?://\\S+')\n", + " youtube_pattern = re.compile(r'(https?://(?:www\\.)?youtube\\.com/watch\\?v=[\\w-]+|https?://(?:www\\.)?youtu\\.be/[\\w-]+)')\n", + " data = []\n", + " current_main_category = None\n", + " current_job = None\n", + " jobs = {}\n", + "\n", + " for line, highlight_color in lines:\n", + " youtube_match = youtube_pattern.search(line)\n", + " if youtube_match:\n", + " # Extract the YouTube URL\n", + " youtube_url = youtube_match.group()\n", + " video_id = re.search(r'(?:v=|youtu\\.be/)([\\w-]+)', youtube_url).group(1)\n", + " \n", + " # Store the video ID instead of the iframe HTML\n", + " if current_job and current_job in jobs:\n", + " jobs[current_job]['videos'].append({\n", + " 'video_id': video_id, # Store only the video ID\n", + " 'url': youtube_url # Optionally store the original URL\n", + " })\n", + " elif url_pattern.search(line):\n", + " # Extract the URL\n", + " url_match = url_pattern.search(line)\n", + " url = url_match.group()\n", + " \n", + " # Extract the category (everything before the URL)\n", + " category = line[:url_match.start()].strip()\n", + "\n", + " if current_job and current_job in jobs:\n", + " jobs[current_job]['links'].append({\n", + " 'url': url,\n", + " 'category': category\n", + " })\n", + " elif line.lower().startswith(\"undefined\"):\n", + " # Skip lines starting with \"undefined\"\n", + " continue\n", + " else:\n", + " if current_main_category is None:\n", + " # Set the main category\n", + " current_main_category = line\n", + " elif current_job is None:\n", + " # Set the job title (sub_category)\n", + " current_job = line\n", + " jobs[current_job] = {\n", + " 'links': [],\n", + " 'videos': [],\n", + " 'degree_required': \"Bachelor's\" if highlight_color == 'cyan' else \"Graduate's\" if highlight_color == 'green' else ''\n", + " }\n", + " else:\n", + " # Handle a new main category if a new line appears\n", + " if line.strip() == \"\":\n", + " if current_main_category:\n", + " # Save the current main category and its jobs\n", + " data.append({\n", + " 'main_category': current_main_category,\n", + " 'jobs': jobs\n", + " })\n", + " # Reset for the next main category\n", + " current_main_category = None\n", + " jobs = {}\n", + " current_job = None\n", + " else:\n", + " # If it's neither a URL nor an empty line, it might be a new job\n", + " if current_job:\n", + " # Make sure to add the current job to jobs before changing\n", + " current_job = line\n", + " jobs[current_job] = {\n", + " 'links': [],\n", + " 'videos': [],\n", + " 'degree_required': \"Bachelor's\" if highlight_color == 'cyan' else \"Graduate's\" if highlight_color == 'green' else ''\n", + " }\n", + "\n", + " # Handle the last main category and jobs if they exist\n", + " if current_main_category and jobs:\n", + " data.append({\n", + " 'main_category': current_main_category,\n", + " 'jobs': jobs\n", + " })\n", + "\n", + " return data\n", + "\n", + "# Function to scrape the Quick Facts table and generate the HTML\n", + "def scrape_quick_facts(driver, url):\n", + " driver.get(url) # Open the URL\n", + " time.sleep(5) # Allow time for the page to load\n", + "\n", + " try:\n", + " # Locate the Quick Facts table by its ID\n", + " quickfacts_table = driver.find_element(By.ID, 'quickfacts')\n", + " \n", + " # Get the outer HTML of the table\n", + " quickfacts_html = quickfacts_table.get_attribute('outerHTML')\n", + " \n", + " # Parse the HTML using BeautifulSoup\n", + " soup = BeautifulSoup(quickfacts_html, 'html.parser')\n", + " \n", + " # Remove all links\n", + " for link in soup.find_all('a'):\n", + " link.unwrap() # Remove the link but keep the text\n", + " \n", + " # Remove specific rows\n", + " rows_to_remove = [\"Work Experience in a Related Occupation\", \"On-the-job Training\"]\n", + " for row in soup.find_all('tr'):\n", + " header = row.find('th')\n", + " if header and header.get_text(strip=True) in rows_to_remove:\n", + " row.decompose() # Remove the row entirely\n", + " \n", + " # Modify the first row to have a single cell that spans all columns\n", + " first_row = soup.find('tr')\n", + " first_cell = soup.new_tag('th')\n", + " first_cell['colspan'] = '2' # Set to span all columns\n", + " first_cell.string = 'Quick Facts Table' # Change this to your desired title\n", + " first_row.clear() # Clear existing cells in the first row\n", + " first_row.append(first_cell) # Add the new cell\n", + " \n", + " # Add the source link at the bottom of the table\n", + " source_row = soup.new_tag('tr')\n", + " source_header = soup.new_tag('th')\n", + " source_header['colspan'] = '2' # Span across two columns\n", + " source_header.string = 'Source: '\n", + " \n", + " # Create the source link\n", + " source_link = soup.new_tag('a', href=url)\n", + " source_link.string = 'www.bls.gov'\n", + " source_header.append(source_link)\n", + " \n", + " source_row.append(source_header)\n", + " soup.find('tbody').append(source_row) # Add the source row to the table\n", + "\n", + " # Get the updated HTML without links and excluded rows\n", + " updated_quickfacts_html = str(soup)\n", + "\n", + " return updated_quickfacts_html # Return the HTML for further processing\n", + "\n", + " except Exception as e:\n", + " print(\"Error: Unable to find the Quick Facts table. Exception:\", e)\n", + " return None\n", + "\n", + "def main(file_path, start_line, end_line, output_file):\n", + " lines = read_docx(file_path, start_line, end_line)\n", + " job_data = parse_lines(lines) # Parse the job data from the Word document\n", + " combined_data = []\n", + "\n", + " # Set up the Edge WebDriver\n", + " options = webdriver.EdgeOptions()\n", + " driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)\n", + "\n", + " # Loop through each job entry\n", + " for job_category in job_data:\n", + " jobs = job_category['jobs']\n", + " for job_title, job_info in jobs.items():\n", + " print(f\"Processing job: {job_title} in category: {job_category['main_category']}\")\n", + " \n", + " # Check if any of the job links is from bls.gov\n", + " bls_links = [link['url'] for link in job_info['links'] if 'bls.gov' in link['url']]\n", + " if bls_links:\n", + " # Scrape the Quick Facts table for this job\n", + " url = bls_links[0] # Use the first bls.gov link found\n", + " html_table = scrape_quick_facts(driver, url) # Scrape the HTML table\n", + " if html_table:\n", + " job_info['jobs_table'] = html_table # Add the HTML table to the job info\n", + "\n", + " combined_data.append(job_category) # Add the job category with all jobs to combined data\n", + " \n", + " # Save the combined data to a JSON file incrementally\n", + " with open(output_file, 'w') as f:\n", + " json.dump(combined_data, f, indent=4)\n", + " print(f\"Progress saved: {len(combined_data)} job categories processed.\")\n", + "\n", + " driver.quit() # Close the browser\n", + "\n", + "# Specify the file path and line range\n", + "file_path = 'careers.docx'\n", + "start_line = 171 # Starting line (inclusive)\n", + "end_line = start_line + 3000 # Adjust the ending line as needed\n", + "output_file = 'jobs.json'\n", + "\n", + "main(file_path, start_line, end_line, output_file)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing job: Advertising Sales Representative / Agent / Executive in category: Business, Advertising, and Finance\n", + "Progress saved: 1 job categories processed.\n", + "Processing job: Child Development Specialist in category: Children and Families\n", + "Progress saved: 2 job categories processed.\n", + "Processing job: Crisis Intervention Counselor in category: Counseling\n", + "Progress saved: 3 job categories processed.\n", + "Processing job: Admissions Evaluator in category: Education\n", + "Progress saved: 4 job categories processed.\n", + "Processing job: Child Life Specialist in category: Health and Medical Services\n", + "Progress saved: 5 job categories processed.\n", + "Processing job: Benefits Manager in category: Human Resources\n", + "Progress saved: 6 job categories processed.\n", + "Processing job: Arbitrator, Mediator, Conciliator, or Ombudsman in category: Law and Law Enforcement\n", + "Progress saved: 7 job categories processed.\n", + "Processing job: Paralegal or Legal Assistant in category: Loss Prevention Manager\n", + "Progress saved: 8 job categories processed.\n", + "Processing job: Army Infantry Officer in category: Military\n", + "Progress saved: 9 job categories processed.\n", + "Processing job: Academic Psychologist in category: Psychology\n", + "Error: Unable to find the Quick Facts table. Exception: Message: no such element: Unable to locate element: {\"method\":\"css selector\",\"selector\":\"[id=\"quickfacts\"]\"}\n", + " (Session info: MicrosoftEdge=130.0.2849.46); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception\n", + "Stacktrace:\n", + "\tGetHandleVerifier [0x00007FF69682DC75+12853]\n", + "\tMicrosoft::Applications::Events::EventProperty::empty [0x00007FF696AD83C4+2250276]\n", + "\tMicrosoft::Applications::Events::EventProperty::empty [0x00007FF696A176A6+1460486]\n", + "\t(No symbol) [0x00007FF6966297CC]\n", + "\t(No symbol) [0x00007FF69662990C]\n", + "\t(No symbol) [0x00007FF696663857]\n", + "\t(No symbol) [0x00007FF69664853F]\n", + "\t(No symbol) [0x00007FF696620617]\n", + "\t(No symbol) [0x00007FF696661431]\n", + "\t(No symbol) [0x00007FF696648163]\n", + "\t(No symbol) [0x00007FF69661FB54]\n", + "\t(No symbol) [0x00007FF69661F000]\n", + "\t(No symbol) [0x00007FF69661F741]\n", + "\tMicrosoft::Applications::Events::EventProperty::empty [0x00007FF6969ACDB4+1024020]\n", + "\tMicrosoft::Applications::Events::EventProperty::~EventProperty [0x00007FF696749A04+57108]\n", + "\tMicrosoft::Applications::Events::EventProperty::empty [0x00007FF6969ABDB3+1019923]\n", + "\tMicrosoft::Applications::Events::EventProperty::empty [0x00007FF6969ABA19+1019001]\n", + "\tMicrosoft::Applications::Events::ILogConfiguration::operator* [0x00007FF6967CE031+394913]\n", + "\tMicrosoft::Applications::Events::ILogConfiguration::operator* [0x00007FF6967CA954+380868]\n", + "\tMicrosoft::Applications::Events::ILogConfiguration::operator* [0x00007FF6967CAA99+381193]\n", + "\tMicrosoft::Applications::Events::ILogConfiguration::operator* [0x00007FF6967C2296+346374]\n", + "\tBaseThreadInitThunk [0x00007FF8A22F1FD7+23]\n", + "\tRtlUserThreadStart [0x00007FF8A2C5D7D0+32]\n", + "\n", + "Progress saved: 10 job categories processed.\n", + "Processing job: Youth Minister in category: Religion and Spirituality\n", + "Progress saved: 11 job categories processed.\n", + "Processing job: Caseworker or Case Manager in category: Social and Human Services\n", + "Progress saved: 12 job categories processed.\n", + "Processing job: Activities Director in category: Sport, Recreation, and Fitness\n", + "Progress saved: 13 job categories processed.\n", + "Processing job: Computer Programmer in category: Technology\n", + "Progress saved: 14 job categories processed.\n", + "Processing job: Occupational Therapist Assistant in category: Therapy\n", + "Progress saved: 15 job categories processed.\n", + "Processing job: Actor in category: Other\n", + "Progress saved: 16 job categories processed.\n" + ] + } + ], + "source": [ + "import re\n", + "import json\n", + "from docx import Document\n", + "from docx.oxml.ns import qn\n", + "from selenium import webdriver\n", + "from selenium.webdriver.edge.service import Service\n", + "from selenium.webdriver.common.by import By\n", + "from webdriver_manager.microsoft import EdgeChromiumDriverManager\n", + "from bs4 import BeautifulSoup\n", + "import time\n", + "\n", + "# Function to check if run is highlighted and get the highlight color\n", + "def get_highlight_color(run):\n", + " highlight_elements = run.element.xpath('.//w:highlight')\n", + " if highlight_elements:\n", + " highlight_element = highlight_elements[0]\n", + " color = highlight_element.get(qn('w:val'))\n", + " return color\n", + " return None\n", + "\n", + "def read_docx(file_path, start_line, end_line):\n", + " doc = Document(file_path)\n", + " lines = []\n", + " line_count = 0\n", + " for paragraph in doc.paragraphs:\n", + " if start_line <= line_count < end_line:\n", + " para_text = paragraph.text.strip()\n", + " highlight_color = None\n", + " for run in paragraph.runs:\n", + " color = get_highlight_color(run)\n", + " if color:\n", + " highlight_color = color # Capture the first highlighted color in the paragraph\n", + " break\n", + " lines.append((para_text, highlight_color)) # Store line and highlight color as a tuple\n", + " line_count += 1\n", + " return lines\n", + "\n", + "def parse_lines(lines):\n", + " url_pattern = re.compile(r'https?://\\S+')\n", + " youtube_pattern = re.compile(r'(https?://(?:www\\.)?youtube\\.com/watch\\?v=[\\w-]+|https?://(?:www\\.)?youtu\\.be/[\\w-]+)')\n", + " data = []\n", + " current_main_category = None\n", + " current_job = None\n", + " jobs = {}\n", + "\n", + " for line, highlight_color in lines:\n", + " youtube_match = youtube_pattern.search(line)\n", + " if youtube_match:\n", + " # Extract the YouTube URL\n", + " youtube_url = youtube_match.group()\n", + " video_id = re.search(r'(?:v=|youtu\\.be/)([\\w-]+)', youtube_url).group(1)\n", + " \n", + " # Store the video ID instead of the iframe HTML\n", + " if current_job and current_job in jobs:\n", + " jobs[current_job]['videos'].append({\n", + " 'video_id': video_id, # Store only the video ID\n", + " 'url': youtube_url # Optionally store the original URL\n", + " })\n", + " elif url_pattern.search(line):\n", + " # Extract the URL\n", + " url_match = url_pattern.search(line)\n", + " url = url_match.group()\n", + " \n", + " # Extract the category (everything before the URL)\n", + " category = line[:url_match.start()].strip()\n", + "\n", + " if current_job and current_job in jobs:\n", + " jobs[current_job]['links'].append({\n", + " 'url': url,\n", + " 'category': category\n", + " })\n", + " elif line.lower().startswith(\"undefined\"):\n", + " # Skip lines starting with \"undefined\"\n", + " continue\n", + " else:\n", + " if current_main_category is None:\n", + " # Set the main category\n", + " current_main_category = line\n", + " elif current_job is None:\n", + " # Set the job title (sub_category)\n", + " current_job = line\n", + " jobs[current_job] = {\n", + " 'links': [],\n", + " 'videos': [],\n", + " 'degree_required': \"Bachelor's\" if highlight_color == 'cyan' else \"Graduate's\" if highlight_color == 'green' else ''\n", + " }\n", + " else:\n", + " # Handle a new main category if a new line appears\n", + " if line.strip() == \"\":\n", + " if current_main_category:\n", + " # Save the current main category and its jobs\n", + " data.append({\n", + " 'main_category': current_main_category,\n", + " 'jobs': jobs\n", + " })\n", + " # Reset for the next main category\n", + " current_main_category = None\n", + " jobs = {}\n", + " current_job = None\n", + " else:\n", + " # If it's neither a URL nor an empty line, it might be a new job\n", + " if current_job:\n", + " # Make sure to add the current job to jobs before changing\n", + " current_job = line\n", + " jobs[current_job] = {\n", + " 'links': [],\n", + " 'videos': [],\n", + " 'degree_required': \"Bachelor's\" if highlight_color == 'cyan' else \"Graduate's\" if highlight_color == 'green' else ''\n", + " }\n", + "\n", + " # Handle the last main category and jobs if they exist\n", + " if current_main_category and jobs:\n", + " data.append({\n", + " 'main_category': current_main_category,\n", + " 'jobs': jobs\n", + " })\n", + "\n", + " return data\n", + "\n", + "def scrape_quick_facts(driver, url):\n", + " driver.get(url) # Open the URL\n", + " time.sleep(5) # Allow time for the page to load\n", + "\n", + " try:\n", + " # Locate the Quick Facts table by its ID\n", + " quickfacts_table = driver.find_element(By.ID, 'quickfacts')\n", + " \n", + " # Get the outer HTML of the table\n", + " quickfacts_html = quickfacts_table.get_attribute('outerHTML')\n", + " \n", + " # Parse the HTML using BeautifulSoup\n", + " soup = BeautifulSoup(quickfacts_html, 'html.parser')\n", + " \n", + " # Remove all links\n", + " for link in soup.find_all('a'):\n", + " link.unwrap() # Remove the link but keep the text\n", + " \n", + " # Remove specific rows\n", + " rows_to_remove = [\"Work Experience in a Related Occupation\", \"On-the-job Training\"]\n", + " for row in soup.find_all('tr'):\n", + " header = row.find('th')\n", + " if header and header.get_text(strip=True) in rows_to_remove:\n", + " row.decompose() # Remove the row entirely\n", + " \n", + " # Modify the first row to have a single cell that spans all columns\n", + " first_row = soup.find('tr')\n", + " first_cell = soup.new_tag('th')\n", + " first_cell['colspan'] = '2' # Set to span all columns\n", + " first_cell.string = 'Quick Facts Table' # Change this to your desired title\n", + " first_row.clear() # Clear existing cells in the first row\n", + " first_row.append(first_cell) # Add the new cell\n", + " \n", + " # Add the source link at the bottom of the table\n", + " source_row = soup.new_tag('tr')\n", + " source_header = soup.new_tag('th')\n", + " source_header['colspan'] = '2' # Span across two columns\n", + " source_header.string = 'Source: '\n", + " \n", + " # Create the source link\n", + " source_link = soup.new_tag('a', href=url)\n", + " source_link.string = 'www.bls.gov'\n", + " source_header.append(source_link)\n", + " \n", + " source_row.append(source_header)\n", + " soup.find('tbody').append(source_row) # Add the source row to the table\n", + "\n", + " # Get the updated HTML without links and excluded rows\n", + " updated_quickfacts_html = str(soup)\n", + "\n", + " return updated_quickfacts_html # Return the HTML for further processing\n", + "\n", + " except Exception as e:\n", + " print(\"Error: Unable to find the Quick Facts table. Exception:\", e)\n", + " return None\n", + "\n", + "def main(file_path, start_line, end_line, output_file):\n", + " lines = read_docx(file_path, start_line, end_line)\n", + " job_data = parse_lines(lines) # Parse the job data from the Word document\n", + " combined_data = []\n", + "\n", + " # Set up the Edge WebDriver\n", + " options = webdriver.EdgeOptions()\n", + " driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)\n", + "\n", + " # Nested function to process each job and scrape Quick Facts\n", + " def process_job(job_category):\n", + " jobs = job_category['jobs']\n", + " for job_title, job_info in jobs.items():\n", + " print(f\"Processing job: {job_title} in category: {job_category['main_category']}\")\n", + "\n", + " # Check if any of the job links is from bls.gov\n", + " bls_links = [link['url'] for link in job_info['links'] if 'bls.gov' in link['url']]\n", + " if bls_links:\n", + " # Scrape the Quick Facts table for this job\n", + " url = bls_links[0] # Use the first bls.gov link found\n", + " html_table = scrape_quick_facts(driver, url) # Scrape the HTML table\n", + " if html_table:\n", + " job_info['jobs_table'] = html_table # Add the HTML table to the job info\n", + "\n", + " return job_category # Return the job category after processing\n", + "\n", + " # Loop through each job entry\n", + " for job_category in job_data:\n", + " processed_job_category = process_job(job_category) # Process job and scrape Quick Facts\n", + " combined_data.append(processed_job_category) # Add the processed job category to combined data\n", + "\n", + " # Save the combined data to a JSON file incrementally\n", + " with open(output_file, 'w') as f:\n", + " json.dump(combined_data, f, indent=4)\n", + " print(f\"Progress saved: {len(combined_data)} job categories processed.\")\n", + "\n", + " driver.quit() # Close the browser\n", + "\n", + "# Specify the file path and line range\n", + "file_path = 'careers.docx'\n", + "start_line = 171 # Starting line (inclusive)\n", + "end_line = start_line + 3000 # Adjust the ending line as needed\n", + "output_file = 'jobs.json'\n", + "\n", + "main(file_path, start_line, end_line, output_file)\n" + ] } ], "metadata": {