Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed Wikipedia scraping in T011 #425

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@
"outputs": [],
"source": [
"from pathlib import Path\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"HERE = Path(_dh[-1])\n",
"DATA = HERE / \"data\"\n",
Expand Down Expand Up @@ -195,7 +197,7 @@
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mHTTPError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[3], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m bad_url \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhttps://raw.githubusercontent.com/openkinome/kinodata/master/data/KinHubKinaseList.txt\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 3\u001b[0m bad_response \u001b[38;5;241m=\u001b[39m requests\u001b[38;5;241m.\u001b[39mget(bad_url)\n\u001b[0;32m----> 4\u001b[0m \u001b[43mbad_response\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraise_for_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m bad_response\n",
"File \u001b[0;32m~/.miniconda3/envs/teachopencadd/lib/python3.9/site-packages/requests/models.py:1021\u001b[0m, in \u001b[0;36mResponse.raise_for_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1016\u001b[0m http_error_msg \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 1017\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m Server Error: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mreason\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m for url: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1018\u001b[0m )\n\u001b[1;32m 1020\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m http_error_msg:\n\u001b[0;32m-> 1021\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m HTTPError(http_error_msg, response\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m)\n",
"File \u001b[0;32m~/Documents/coding/py_env/lib/python3.12/site-packages/requests/models.py:1021\u001b[0m, in \u001b[0;36mResponse.raise_for_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1016\u001b[0m http_error_msg \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 1017\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m Server Error: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mreason\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m for url: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1018\u001b[0m )\n\u001b[1;32m 1020\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m http_error_msg:\n\u001b[0;32m-> 1021\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m HTTPError(http_error_msg, response\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m)\n",
"\u001b[0;31mHTTPError\u001b[0m: 404 Client Error: Not Found for url: https://raw.githubusercontent.com/openkinome/kinodata/master/data/KinHubKinaseList.txt"
]
}
Expand Down Expand Up @@ -537,7 +539,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"<!doctype html><html lang=\"en\"><head><meta charset=\"utf-8\"/><title>UniProt</title><meta name=\"viewport\" content=\"width=device-width,initial-scale=1\"/><meta name=\"theme-color\" content=\"#00639a\"/><link rel=\"apple-touch-icon\" sizes=\"180x180\" href=\"/apple-touch-icon.png\"/><link rel=\"shortcut icon\" type=\"image/png\" sizes=\"144x144\" href=\"/mstile-144x144.png\"/><link rel=\"icon\" type=\"image/png\" sizes=\"32x32\" href=\"/favicon-32x32.png\"/><link rel=\"icon\" type=\"image/png\" sizes=\"16x16\" href=\"/favicon-16x16.png\"/><link rel=\"manifest\" href=\"/manifest.json\"/><link rel=\"preconnect\" href=\"https://fonts.gstatic.com/\"/><link rel=\"preconnect\" href=\"https://rest.uniprot.org/\"/><link rel=\"preload\" as=\"style\" href=\"https://fonts.googleapis.com/css?family=Lato:400,700|Source+Sans+Pro:600,700&display=swap\"/><script async src=\"https://www.googletagmanager.com/gtag/js?id=G-V6TXEC4BDF\"></script><script>window.dataLayer = window.dataLayer || [];\n",
"<!doctype html><html lang=\"en\"><head><meta charset=\"utf-8\"/><title>UniProt</title><meta name=\"viewport\" content=\"width=device-width,initial-scale=1\"/><meta name=\"theme-color\" content=\"#00639a\"/><link rel=\"preconnect\" href=\"https://rest.uniprot.org/\"/><link rel=\"preconnect\" href=\"https://fonts.googleapis.com\"/><link rel=\"preconnect\" href=\"https://fonts.gstatic.com\" crossorigin/><link href=\"https://fonts.googleapis.com/css?family=Lato:400,700|Source+Sans+Pro:600,700&display=optional\" rel=\"stylesheet\"/><link rel=\"search\" type=\"application/opensearchdescription+xml\" href=\"/opensearch.xml\" title=\"UniProt\"><link rel=\"apple-touch-icon\" sizes=\"180x180\" href=\"/apple-touch-icon.png\"/><link rel=\"shortcut icon\" type=\"image/png\" sizes=\"144x144\" href=\"/mstile-144x144.png\"/><link rel=\"icon\" type=\"image/png\" sizes=\"32x32\" href=\"/favicon-32x32.png\"/><link rel=\"icon\" type=\"image/png\" sizes=\"16x16\" href=\"/favicon-16x16.png\"/><link rel=\"manifest\" href=\"/manifest.json\"/><script async src=\"https://www.googletagmanager.com/gtag/js?id=G-V6TXEC4BDF\"></script><script>window.dataLayer = window.dataLayer || [];\n",
" function gtag() {\n",
" dataLayer.push(arguments);\n",
" }\n",
Expand Down Expand Up @@ -602,7 +604,7 @@
" to {\n",
" opacity: 1;\n",
" }\n",
" }</style><link rel=\"stylesheet\" href=\"/styles.6f2cd3.css\"/><link rel=\"stylesheet\" href=\"/styles.8a7ea9.css\"/></head><body><div id=\"root\"><div class=\"fallback\" data-nosnippet><div><h1>UniProt website fallback message</h1>If you are not seeing anything on this page, it might be for multiple reasons:<ul><li><strong>You might have JavaScript disabled</strong>: make sure to enable JavaScript on your browser, or use a browser that supports JavaScript.</li><li><strong>You might have an outdated browser</strong>: make sure that your browser is up to date as older versions might not work with the website.</li><li><strong>There might have been a network issue</strong>: ensure that your connectivity is stable and try to reload the page to see if it solves the issue. <button onclick=\"location.reload();\">Reload this page</button></li></ul></div></div></div><script nomodule>// workaround for Safari 10.1 supporting module but ignoring nomodule\n",
" }</style><link rel=\"stylesheet\" href=\"/styles.e2d996.css\"/><link rel=\"stylesheet\" href=\"/styles.cc9b63.css\"/></head><body><div id=\"root\"><div class=\"fallback\" data-nosnippet><div><h1>UniProt website fallback message</h1>If you are not seeing anything on this page, it might be for multiple reasons:<ul><li><strong>You might have JavaScript disabled</strong>: make sure to enable JavaScript on your browser, or use a browser that supports JavaScript.</li><li><strong>You might have an outdated browser</strong>: make sure that your browser is up to date as older versions might not work with the website.</li><li><strong>There might have been a network issue</strong>: ensure that your connectivity is stable and try to reload the page to see if it solves the issue. <button onclick=\"location.reload();\">Reload this page</button></li></ul></div></div></div><script nomodule>// workaround for Safari 10.1 supporting module but ignoring nomodule\n",
" // From https://gist.github.com/samthor/64b114e4a4f539915a95b91ffd340acc\n",
" (function () {\n",
" var d = document;\n",
Expand All @@ -627,7 +629,7 @@
" d.head.appendChild(c);\n",
" c.remove();\n",
" }\n",
" })();</script><script type=\"module\" src=\"/modern.app.81c288.js\"></script><script type=\"module\" src=\"/modern.app.9062d6.js\"></script><script type=\"module\" src=\"/modern.app.922f2d.js\"></script><script type=\"module\" src=\"/modern.app.846d6d.js\"></script><script type=\"module\" src=\"/modern.app.1dc90f.js\"></script><script nomodule defer=\"defer\" src=\"/legacy.app.21ebe9.js\"></script><script nomodule defer=\"defer\" src=\"/legacy.app.9062d6.js\"></script><script nomodule defer=\"defer\" src=\"/legacy.app.922f2d.js\"></script><script nomodule defer=\"defer\" src=\"/legacy.app.788032.js\"></script><script nomodule defer=\"defer\" src=\"/legac\n"
" })();</script><script type=\"module\" src=\"/modern.app.8d9dc8.js\"></script><script type=\"module\" src=\"/modern.app.608090.js\"></script><script type=\"module\" src=\"/modern.app.f86293.js\"></script><script type=\"module\" src=\"/modern.app.5a44ec.js\"></script><script type=\"module\" src=\"/modern.app.a2ca48.js\"></script><script nomodule defer=\"defer\" src=\"/legacy.app.16ef1e.js\"></script><script nomodule defer=\"defer\" src=\"/legacy.app.608090.js\"></script><script \n"
]
}
],
Expand Down Expand Up @@ -670,7 +672,7 @@
"AC P0DTC2;\n",
"DT 22-APR-2020, integrated into UniProtKB/Swiss-Prot.\n",
"DT 22-APR-2020, sequence version 1.\n",
"DT 03-MAY-2023, entry version 16.\n",
"DT 02-OCT-2024, entry version 24.\n",
"DE RecName: Full=Spike glycoprotein {ECO:0000255|HAMAP-Rule:MF_04099};\n",
"DE Short=S glycoprotein {ECO:0000255|HAMAP-Rule:MF_04099};\n",
"DE AltName: Full=E2 {ECO:0000255|HAMAP-Rule:MF_04099};\n",
Expand Down Expand Up @@ -989,7 +991,7 @@
"Name: sp|P12931|SRC_HUMAN\n",
"Description: sp|P12931|SRC_HUMAN Proto-oncogene tyrosine-protein kinase Src OS=Homo sapiens OX=9606 GN=SRC PE=1 SV=3\n",
"Number of features: 0\n",
"Seq('MGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQTPSKPASADGHRGPSAAF...ENL', SingleLetterAlphabet())\n"
"Seq('MGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQTPSKPASADGHRGPSAAF...ENL')\n"
]
}
],
Expand Down Expand Up @@ -1454,7 +1456,7 @@
{
"data": {
"text/plain": [
"<bravado.http_future.HttpFuture at 0x7efddec36b50>"
"<bravado.http_future.HttpFuture at 0x13434e180>"
]
},
"execution_count": 26,
Expand Down Expand Up @@ -1827,9 +1829,8 @@
"# To guess the correct steps here, you will have to inspect the HTML code by hand\n",
"# Tip: use right-click + inspect content in any webpage to land in the HTML definition ;)\n",
"html = BeautifulSoup(r.text)\n",
"header = html.find(\"span\", id=\"General_chemical_properties\")\n",
"table = header.find_all_next()[4]\n",
"table_body = table.find(\"tbody\")\n",
"header = html.find(id=\"General_chemical_properties\")\n",
"table_body = header.find_next(\"tbody\")\n",
"\n",
"data = []\n",
"for row in table_body.find_all(\"tr\"):\n",
Expand Down Expand Up @@ -1911,7 +1912,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.12.6"
},
"toc-autonumbering": true,
"widgets": {
Expand Down