StampyAI · mruwnik · Aug 7, 2023 · Aug 1, 2023 · Aug 3, 2023 · Aug 3, 2023
diff --git a/.github/workflows/fetch-dataset.yml b/.github/workflows/fetch-dataset.yml
@@ -29,7 +29,6 @@ on:
         options:
           - agentmodels
           - aiimpacts
-          - aipulse
           - aisafety.camp
           - aisafety.info
           - ai_alignment_playlist
@@ -40,6 +39,7 @@ on:
           - alignmentforum
           - alignment_newsletter
           - arbital
+          - arxiv
           - carado.moe
           - cold_takes
           - deepmind_blog
@@ -49,7 +49,6 @@ on:
           - ebooks
           - eleuther.ai
           - gdocs
-          - gdrive_ebooks
           - generative.ink
           - gwern_blog
           - html_articles
@@ -59,14 +58,12 @@ on:
           - markdown
           - miri
           - ml_safety_newsletter
-          - nonarxiv_papers
-          - qualiacomputing
           - openai.research
           - pdfs
-          - reports
           - rob_miles_ai_safety
           - vkrakovna_blog
           - yudkowsky_blog
+          - xmls
 
 jobs:
   build-dataset:
@@ -81,10 +78,17 @@ jobs:
       with:
         python-version: '3.x'
 
+    # Pandoc is installed only when the selected datasource is gdocs.
+    - name: Install Pandoc
+      if: inputs.datasource == 'gdocs'
+      run: |
+        sudo apt-get update
+        sudo apt-get -y install pandoc
+
     - name: Install dependencies
       run: pip install -r requirements.txt
 
-    - name: Generate dataset file
+    - name: Process dataset
       env:
         CODA_TOKEN: ${{ secrets.CODA_TOKEN || inputs.coda_token }}
         YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY || inputs.youtube_api_key }}

diff --git a/.github/workflows/fetch-weekly.yml b/.github/workflows/fetch-weekly.yml
@@ -11,7 +11,6 @@ jobs:
         datasource:
           - agentmodels
           - aiimpacts
-          - aipulse
           - aisafety.camp
           - ai_alignment_playlist
           - ai_explained
@@ -30,7 +29,6 @@ jobs:
           - ebooks
           - eleuther.ai
           - gdocs
-          - gdrive_ebooks
           - generative.ink
           - gwern_blog
           - html_articles
@@ -39,14 +37,12 @@ jobs:
           - markdown
           - miri
           - ml_safety_newsletter
-          - nonarxiv_papers
-          - qualiacomputing
           - openai.research
           - pdfs
-          - reports
           - rob_miles_ai_safety
           - vkrakovna_blog
           - yudkowsky_blog
+          - xmls
 
     uses: ./.github/workflows/fetch-dataset.yml
     with:

diff --git a/.github/workflows/push-datasets.yml → .github/workflows/push-dataset.yml b/.github/workflows/push-datasets.yml → .github/workflows/push-dataset.yml
@@ -1,17 +1,32 @@
 name: Synch uploaded jsonl files to HuggingFace
 
 on:
+  workflow_call:
+    inputs:
+      datasource:
+        type: string
+        required: true
+      coda_token:
+        type: string
+        required: true
+      db_user:
+        type: string
+        required: true
+      db_password:
+        type: string
+        required: true
+      db_host:
+        type: string
+        required: true
   workflow_dispatch: # allow manual triggering
     inputs:
       datasource:
         description: 'The datasource to process'
         type: choice
         default: all
         options:
-          - all
           - agentmodels
           - aiimpacts
-          - aipulse
           - aisafety.camp
           - aisafety.info
           - ai_alignment_playlist
@@ -20,7 +35,6 @@ on:
           - ai_safety_reading_group
           - ai_tech_tu_delft
           - alignmentforum
-          - alignment_newsletter
           - arbital
           - arxiv
           - carado.moe
@@ -29,10 +43,8 @@ on:
           - deepmind_technical_blog
           - distill
           - eaforum
-          - ebooks
           - eleuther.ai
           - gdocs
-          - gdrive_ebooks
           - generative.ink
           - gwern_blog
           - html_articles
@@ -42,14 +54,12 @@ on:
           - markdown
           - miri
           - ml_safety_newsletter
-          - nonarxiv_papers
-          - qualiacomputing
           - openai.research
           - pdfs
-          - reports
           - rob_miles_ai_safety
           - vkrakovna_blog
           - yudkowsky_blog
+          - xmls
 
 jobs:
   generate-dataset:
@@ -69,11 +79,11 @@ jobs:
 
     - name: Generate dataset file
       env:
-        CODA_TOKEN: ${{ secrets.CODA_TOKEN }}
-        YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
-        ARD_DB_USER: ${{ secrets.ARD_DB_USER }}
-        ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD }}
-        ARD_DB_HOST: ${{ secrets.ARD_DB_HOST }}
+        CODA_TOKEN: ${{ secrets.CODA_TOKEN || inputs.coda_token }}
+        YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY || inputs.youtube_api_key }}
+        ARD_DB_USER: ${{ secrets.ARD_DB_USER || inputs.db_user }}
+        ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD || inputs.db_password }}
+        ARD_DB_HOST: ${{ secrets.ARD_DB_HOST || inputs.db_host }}
         ARD_DB_NAME: alignment_research_dataset
       run: python main.py generate_jsonl_files ${{ inputs.datasource }}
 

diff --git a/.github/workflows/upload-to-huggingface.yml b/.github/workflows/upload-to-huggingface.yml
@@ -0,0 +1,60 @@
+# Runs the reusable push-dataset.yml workflow once per datasource, either on
+# the weekly schedule below or when triggered manually.
+name: Upload datasets to Huggingface
+on:
+  workflow_dispatch: # allow manual triggering
+  schedule:
+    - cron: "0 3 * * 0"  # Every Sunday at 03:00 UTC
+
+jobs:
+  update_dataset:
+    strategy:
+      matrix:
+        datasource:
+          - agentmodels
+          - aiimpacts
+          - aisafety.camp
+          - aisafety.info
+          - ai_alignment_playlist
+          - ai_explained
+          - ai_safety_talks
+          - ai_safety_reading_group
+          - ai_tech_tu_delft
+          - alignmentforum
+          - arbital
+          - arxiv
+          - carado.moe
+          - cold_takes
+          - deepmind_blog
+          - deepmind_technical_blog
+          - distill
+          - eaforum
+          - eleuther.ai
+          - gdocs
+          - generative.ink
+          - gwern_blog
+          - html_articles
+          - importai
+          - jsteinhardt_blog
+          - lesswrong
+          - markdown
+          - miri
+          - ml_safety_newsletter
+          - openai.research
+          - pdfs
+          - rob_miles_ai_safety
+          - vkrakovna_blog
+          - yudkowsky_blog
+          - xmls
+
+    uses: ./.github/workflows/push-dataset.yml
+    with:
+      datasource: ${{ matrix.datasource }}
+      # This workflow declares no inputs, so the expressions below are empty
+      # here. They stay because push-dataset.yml marks them as required; it
+      # reads each value as `secrets.X || inputs.x`, so inherited secrets win.
+      coda_token: ${{ inputs.coda_token }}
+      db_user: ${{ inputs.db_user }}
+      db_password: ${{ inputs.db_password }}
+      db_host: ${{ inputs.db_host }}
+    secrets: inherit
diff --git a/README.md b/README.md
@@ -10,32 +10,40 @@ The following list of sources may change and items may be renamed:
 - [aiimpacts](https://aiimpacts.org/)
 - [aisafety.camp](https://aisafety.camp/)
 - [aisafety.info](https://aisafety.info/)
+- ai_alignment_playlist
+- [ai_explained](https://www.youtube.com/@ai-explained-)
+- [ai_safety_talks](https://www.youtube.com/@aisafetytalks)
+- [ai_safety_reading_group](https://www.youtube.com/@aisafetyreadinggroup/videos)
+- [ai_tech_tu_delft](https://www.youtube.com/@AiTechTUDelft/)
 - [alignmentforum](https://www.alignmentforum.org)
 - [alignment_newsletter](https://rohinshah.com/alignment-newsletter/)
 - [arbital](https://arbital.com/)
 - arxiv - alignment research papers from [arxiv](https://arxiv.org/)
-- audio_transcripts - transcripts from interviews with various researchers and other audio recordings
 - [carado.moe](https://carado.moe/)
 - [cold_takes](https://www.cold-takes.com/)
 - [deepmind_blog](https://deepmindsafetyresearch.medium.com/)
+- [deepmind_technical_blog](https://www.deepmind.com/blog-categories/technical-blogs)
 - [distill](https://distill.pub/)
 - [eaforum](https://forum.effectivealtruism.org/) - selected posts
-- gdocs
-- gdrive_ebooks - books include [Superintelligence](https://www.goodreads.com/book/show/20527133-superintelligence), [Human Compatible](https://www.goodreads.com/book/show/44767248-human-compatible), [Life 3.0](https://www.goodreads.com/book/show/34272565-life-3-0), [The Precipice](https://www.goodreads.com/book/show/50485582-the-precipice), and others
+- [eleuther.ai](https://blog.eleuther.ai/)
 - [generative.ink](https://generative.ink/posts/)
 - [gwern_blog](https://gwern.net/)
+- gdocs - various doc files stored on Google drive
+- html_articles - various articles on websites
 - [import.ai](https://importai.substack.com)
 - [jsteinhardt_blog](https://jsteinhardt.wordpress.com/)
 - [lesswrong](https://www.lesswrong.com/) - selected posts
-- markdown.ebooks
+- markdown
 - [miri](https://intelligence.org/) - MIRI
 - [ml_safety_newsletter](https://newsletter.mlsafety.org)
-- nonarxiv_papers - other alignment research papers
-- [qualiacomputing](https://qualiacomputing.com/)
-- reports
+- [openai.research](https://openai.com/research)
+- pdfs - various pdfs from different places
+- [rob_miles_ai_safety](https://www.youtube.com/@RobertMilesAI)
 - [vkrakovna_blog](https://vkrakovna.wordpress.com)
 - [waitbutwhy](https://waitbutwhy.com/)
 - [yudkowsky_blog](https://www.yudkowsky.net/)
+- xmls - various articles stored as XML files
+
 
 ## Keys
 

diff --git a/align_data/__init__.py b/align_data/__init__.py
@@ -3,7 +3,6 @@
 import align_data.sources.blogs as blogs
 import align_data.sources.ebooks as ebooks
 import align_data.sources.arxiv_papers as arxiv_papers
-import align_data.sources.reports as reports
 import align_data.sources.greaterwrong as greaterwrong
 import align_data.sources.stampy as stampy
 import align_data.sources.alignment_newsletter as alignment_newsletter
@@ -16,7 +15,6 @@
     + blogs.BLOG_REGISTRY
     + ebooks.EBOOK_REGISTRY
     + arxiv_papers.ARXIV_REGISTRY
-    + reports.REPORT_REGISTRY
     + greaterwrong.GREATERWRONG_REGISTRY
     + stampy.STAMPY_REGISTRY
     + distill.DISTILL_REGISTRY