From ac983fc8bb6066f17c4e4ebfad9266e2612c5b65 Mon Sep 17 00:00:00 2001 From: Daniel O'Connell Date: Fri, 4 Aug 2023 12:19:21 +0200 Subject: [PATCH] PR changes --- .github/workflows/fetch-dataset.yml | 3 +-- .github/workflows/fetch-weekly.yml | 3 +-- .github/workflows/push-dataset.yml | 3 +-- .github/workflows/upload-to-huggingface.yml | 3 +-- README.md | 22 +++++++++++++------ align_data/sources/articles/__init__.py | 2 +- .../sources/arxiv_papers/arxiv_papers.py | 2 +- align_data/sources/blogs/__init__.py | 1 - tests/align_data/test_arxiv.py | 3 +-- 9 files changed, 22 insertions(+), 20 deletions(-) diff --git a/.github/workflows/fetch-dataset.yml b/.github/workflows/fetch-dataset.yml index d9b17bae..788e24fc 100644 --- a/.github/workflows/fetch-dataset.yml +++ b/.github/workflows/fetch-dataset.yml @@ -58,13 +58,12 @@ on: - markdown - miri - ml_safety_newsletter - - nonarxiv_papers - - qualiacomputing - openai.research - pdfs - rob_miles_ai_safety - vkrakovna_blog - yudkowsky_blog + - xmls jobs: build-dataset: diff --git a/.github/workflows/fetch-weekly.yml b/.github/workflows/fetch-weekly.yml index 49fe7597..4af1fa26 100644 --- a/.github/workflows/fetch-weekly.yml +++ b/.github/workflows/fetch-weekly.yml @@ -37,13 +37,12 @@ jobs: - markdown - miri - ml_safety_newsletter - - nonarxiv_papers - - qualiacomputing - openai.research - pdfs - rob_miles_ai_safety - vkrakovna_blog - yudkowsky_blog + - xmls uses: ./.github/workflows/fetch-dataset.yml with: diff --git a/.github/workflows/push-dataset.yml b/.github/workflows/push-dataset.yml index 7108df65..768cf2a6 100644 --- a/.github/workflows/push-dataset.yml +++ b/.github/workflows/push-dataset.yml @@ -54,13 +54,12 @@ on: - markdown - miri - ml_safety_newsletter - - nonarxiv_papers - - qualiacomputing - openai.research - pdfs - rob_miles_ai_safety - vkrakovna_blog - yudkowsky_blog + - xmls jobs: generate-dataset: diff --git a/.github/workflows/upload-to-huggingface.yml b/.github/workflows/upload-to-huggingface.yml index 958353b4..eaac2ceb 100644 --- a/.github/workflows/upload-to-huggingface.yml +++ b/.github/workflows/upload-to-huggingface.yml @@ -38,13 +38,12 @@ jobs: - markdown - miri - ml_safety_newsletter - - nonarxiv_papers - - qualiacomputing - openai.research - pdfs - rob_miles_ai_safety - vkrakovna_blog - yudkowsky_blog + - xmls uses: ./.github/workflows/push-dataset.yml with: diff --git a/README.md b/README.md index c8166946..3e820519 100644 --- a/README.md +++ b/README.md @@ -10,32 +10,40 @@ The following list of sources may change and items may be renamed: - [aiimpacts](https://aiimpacts.org/) - [aisafety.camp](https://aisafety.camp/) - [aisafety.info](https://aisafety.info/) +- [ai_alignment_playlist]() +- [ai_explained](https://www.youtube.com/@ai-explained-) +- [ai_safety_talks](https://www.youtube.com/@aisafetytalks) +- [ai_safety_reading_group](https://www.youtube.com/@aisafetyreadinggroup/videos) +- [ai_tech_tu_delft](https://www.youtube.com/@AiTechTUDelft/) - [alignmentforum](https://www.alignmentforum.org) - [alignment_newsletter](https://rohinshah.com/alignment-newsletter/) - [arbital](https://arbital.com/) - arxiv - alignment research papers from [arxiv](https://arxiv.org/) -- audio_transcripts - transcripts from interviews with various researchers and other audio recordings - [carado.moe](https://carado.moe/) - [cold_takes](https://www.cold-takes.com/) - [deepmind_blog](https://deepmindsafetyresearch.medium.com/) +- [deepmind_technical_blog](https://www.deepmind.com/blog-categories/technical-blogs) - [distill](https://distill.pub/) - [eaforum](https://forum.effectivealtruism.org/) - selected posts -- ebooks - books include [Superintelligence](https://www.goodreads.com/book/show/20527133-superintelligence), [Human Compatible](https://www.goodreads.com/book/show/44767248-human-compatible), [Life 3.0](https://www.goodreads.com/book/show/34272565-life-3-0), [The Precipice](https://www.goodreads.com/book/show/50485582-the-precipice), and others -- gdocs +- [eleuther.ai](https://blog.eleuther.ai/) - [generative.ink](https://generative.ink/posts/) - [gwern_blog](https://gwern.net/) +- gdocs - various doc files stored on Google drive +- html_articles - various articles on websites - [import.ai](https://importai.substack.com) - [jsteinhardt_blog](https://jsteinhardt.wordpress.com/) - [lesswrong](https://www.lesswrong.com/) - selected posts -- markdown.ebooks +- markdown - [miri](https://intelligence.org/) - MIRI - [ml_safety_newsletter](https://newsletter.mlsafety.org) -- nonarxiv_papers - other alignment research papers -- [qualiacomputing](https://qualiacomputing.com/) -- reports +- [openai.research](https://openai.com/research) +- pdfs - various pdfs from different places +- [rob_miles_ai_safety](https://www.youtube.com/@RobertMilesAI) - [vkrakovna_blog](https://vkrakovna.wordpress.com) - [waitbutwhy](https://waitbutwhy.com/) - [yudkowsky_blog](https://www.yudkowsky.net/) +- xmls - various articles stored as XML files + ## Keys diff --git a/align_data/sources/articles/__init__.py b/align_data/sources/articles/__init__.py index a6fff663..6775e496 100644 --- a/align_data/sources/articles/__init__.py +++ b/align_data/sources/articles/__init__.py @@ -19,7 +19,7 @@ sheet_id='1800487220' ), XMLArticles( - name='nonarxiv_papers', + name='xmls', spreadsheet_id='1l3azVJVukGAvZPgg0GyeqiaQe8bEMZvycBJaA8cRXf4', sheet_id='823056509' ), diff --git a/align_data/sources/arxiv_papers/arxiv_papers.py b/align_data/sources/arxiv_papers/arxiv_papers.py index d4eef69f..ae9b7cb9 100644 --- a/align_data/sources/arxiv_papers/arxiv_papers.py +++ b/align_data/sources/arxiv_papers/arxiv_papers.py @@ -62,7 +62,7 @@ def process_entry(self, item) -> None: "authors": authors, "date_published": self._get_published_date(self.is_val(item.date_published) or paper.get('date_published')), "data_last_modified": str(metadata.updated), - "abstract": metadata.summary.replace("\n", " "), + "summary": metadata.summary.replace("\n", " "), "author_comment": metadata.comment, "journal_ref": metadata.journal_ref, "doi": metadata.doi, diff --git a/align_data/sources/blogs/__init__.py b/align_data/sources/blogs/__init__.py index 8f1d5fc1..7021c994 100644 --- a/align_data/sources/blogs/__init__.py +++ b/align_data/sources/blogs/__init__.py @@ -12,7 +12,6 @@ WordpressBlog(name="aisafety.camp", url="https://aisafety.camp"), WordpressBlog(name="miri", url="https://intelligence.org"), WordpressBlog(name="jsteinhardt_blog", url="https://jsteinhardt.wordpress.com"), - WordpressBlog(name="qualiacomputing", url="https://qualiacomputing.com"), WordpressBlog(name="vkrakovna_blog", url="https://vkrakovna.wordpress.com"), WordpressBlog(name="yudkowsky_blog", url="https://yudkowsky.net"), MediumBlog(name="deepmind_blog", url="https://deepmindsafetyresearch.medium.com/", authors=["DeepMind Safety Research"]), diff --git a/tests/align_data/test_arxiv.py b/tests/align_data/test_arxiv.py index 00b07969..30717d9e 100644 --- a/tests/align_data/test_arxiv.py +++ b/tests/align_data/test_arxiv.py @@ -44,7 +44,6 @@ def test_process_entry(): with patch('align_data.arxiv_papers.arxiv_papers.parse_vanity', return_value=contents): with patch('align_data.arxiv_papers.arxiv_papers.arxiv', arxiv): assert dataset.process_entry(item).to_dict() == { - 'abstract': 'abstract bla bla', 'author_comment': 'no comment', 'authors': ['mr blobby'], 'categories': 'wut', @@ -56,7 +55,7 @@ def test_process_entry(): 'primary_category': 'cat', 'source': 'asd', 'source_type': 'html', - 'summaries': [], + 'summaries': ['abstract bla bla'], 'text': 'this is the text', 'title': 'this is the title', 'url': 'https://arxiv.org/abs/2001.11038',