From 1c4539b9df1e5369899d346e990606954312fd3c Mon Sep 17 00:00:00 2001 From: Brian Chapman Date: Wed, 15 Nov 2017 14:17:06 -0700 Subject: [PATCH] added m14 inclass materials --- .../InClass/recognizing_phrases.ipynb | 605 ++++++++++++++++++ ...orking_with_mimic2_radiology_reports.ipynb | 373 +++++++++++ .../ROADMAP.ipynb | 2 +- 3 files changed, 979 insertions(+), 1 deletion(-) create mode 100644 modules/m14_linear_algebra_text_processing/InClass/recognizing_phrases.ipynb create mode 100644 modules/m14_linear_algebra_text_processing/InClass/working_with_mimic2_radiology_reports.ipynb diff --git a/modules/m14_linear_algebra_text_processing/InClass/recognizing_phrases.ipynb b/modules/m14_linear_algebra_text_processing/InClass/recognizing_phrases.ipynb new file mode 100644 index 0000000..27fc194 --- /dev/null +++ b/modules/m14_linear_algebra_text_processing/InClass/recognizing_phrases.ipynb @@ -0,0 +1,605 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Phrases\n", + "\n", + "So far we have only thought in terms of single words: \"lower\", \"lobe\", \"University\", \"of\", \"Utah\". But in reality oftentimes multiple words form one unit of thought: \"University of Utah\". Our word vectors will do a better job of representing our text if we first recognize these phrases. We are going to use the [gensim](https://radimrehurek.com/gensim/models/phrases.html) package to detect and transform these phrases.\n", + "\n", + "For example, the sentence, \"I am a faculty member in the departments of Biomedical Informatics and Radiology and Imaging Sciences at the University of Utah.\" would be transformed to \"I am a faculty member in the departments of Biomedical_Informatics and Radiology_and_Imaging_Sciences at the University_of_Utah.\"\n", + "\n", + "\"Biomedical_Informatics\" is an example of a **bigram phrase** and \"University_of_Utah\" is a **trigram phrase**.
I guess \"Radiology_and_Imaging_Sciences\" is a quadgram phrase, but we will likely not try to detect phrases that long." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using the Gensim Phrases Module" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nose.tools import assert_almost_equal, assert_true, assert_equal, assert_raises\n", + "from numbers import Number" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Upgrade to the latest version of gensim" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!conda install gensim -y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pymysql\n", + "import pandas as pd\n", + "import getpass\n", + "from textblob import TextBlob\n", + "import re\n", + "from gensim.models.phrases import Phraser, Phrases\n", + "from IPython.display import clear_output, display, HTML\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gensim\n", + "gensim.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conn = pymysql.connect(host=\"mysql\",\n", + " port=3306,user=\"jovyan\",\n", + " passwd=getpass.getpass(\"Enter MySQL passwd for jovyan\"),db='mimic2')\n", + "cursor = conn.cursor()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Select Some Text from the MIMIC2 Database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data = \\\n", + "pd.read_sql(\"\"\"SELECT 
noteevents.subject_id, \n", + " noteevents.hadm_id,\n", + " noteevents.text \n", + " FROM noteevents\n", + " WHERE noteevents.category = 'RADIOLOGY_REPORT' LIMIT 5000\"\"\",conn)\n", + "rad_data.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Regular expressions for data cleansing\n", + "\n", + "* Write a regular expression to replace dates in the reports with ``[**DATE**]``\n", + "* Write a regular expression to replace times in the reports with ``[**TIME**]``\n", + "* Write a regular expression to replace digits with \"d\", (e.g. \"43 cc\" would become \"dd cc\")\n", + "\n", + "#### Hints, etc.\n", + "\n", + "* Look at some sample reports to see what dates and times look like in the reports\n", + "* What order would you need to apply the regular expressions?\n", + "* Could we just use the digit recognizer and skip the date and time strippers?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rd = re.compile(r\"\"\"\\d\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write a function to pre-process our text\n", + "\n", + "* Lower case?\n", + "* Digits?\n", + "* Strip dates/times?"
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### But first, write unit tests to test whether `preprocess` is functioning correctly\n", + "#### Then write functionality to pass tests\n", + "\n", + "You might want to use the `strings` module" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import string\n", + "string.ascii_uppercase" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess(txt):\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert_true???" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert_equal???" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert_raises???" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a TextBlob from all the text in `rad_data[\"text\"]`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "blob = TextBlob(preprocess(\" \".join(rad_data[\"text\"])))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Write a function `train_phrases` that will train bigram and trigram detectors\n", + "\n", + "* We want to be able to ignore common terms in our phrase detection\n", + "* We want to be able to specify the minimum number of occurences in our text to be considered a phrase\n", + "* Return a dictionary of detectors" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write unit tests to determine whether `train_phrases` is working as expected" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def train_phrases(blob, common_terms=None, min_count=5):\n", + " pass\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "common_terms = [\"of\", \"with\", \"without\", \"and\", \"or\", \"the\", \"a\"]\n", + "generators = train_phrases(blob, common_terms=common_terms, min_count=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write a function that takes a `TextBlob` instance and phrase generators and returns a string of text\n", + "#### Unit tests first" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_phrased_text(blob, generators):\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(found_phrases)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What phrases did we detect?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "found_phrases = set([w for w in phrased_txt.split() if \"_\" in w])\n", + "print(len(found_phrases))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### How often did each phrase occur?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import ???" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "counted_phrases = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def sorted_counter(cntr):\n", + " lcntr = list(cntr.items())\n", + " lcntr.sort(key=lambda x:x[1], reverse=True)\n", + " return lcntr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lcounted_phrases = sorted_counter(counted_phrases)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for phrase, count in lcounted_phrases:\n", + " print(\"%s\\t%03d\"%(phrase.ljust(40),count))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a word vector vocabulary using only words and phrases that occur more than N times\n", + "### How to choose N?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What is our vocabulary from phrased_txt (how many unique words)?\n", + "\n", + "Why use `TextBlob.words` instead of just `phrased_txt.split()`?\n", + "\n", + "#### Why is `phrased_blob = TextBlob(phrased_txt)` fast and `print(len(set(phrased_blob.words)))` slow?"
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "phrased_blob = TextBlob(phrased_txt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(len(set(phrased_blob.words)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "phrased_blob_count = None\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "phrased_blob_count[:100]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Based on these most frequent words, create a list of stop words to drop from our vocabulary" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_words = []" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What are our infrequent words?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "phrased_blob_count[-2000:-1000]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.distplot([c[1] for c in phrased_blob_count if c[1] > 500])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len([w for w in phrased_blob_count if w[1]>10])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vwords = [w for w in phrased_blob_count if w[1]>0 and w[0] not in stop_words]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vocabulary = {}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Determining Similarity Between Reports\n", + "* CXR vs CT vs MR" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data[rad_data[\"text\"].str.contains(\"MRI\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a Report Browser" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_reports = rad_data.shape[0]\n", + "while True:\n", + " try:\n", + " i = int(input(\"Enter a number between 0 and %d. otherwise to quit\"%num_reports))\n", + " clear_output()\n", + "\n", + " if i < 0 or i >=num_reports:\n", + " break\n", + " txt = TextBlob(rd.sub(\"\"\"d\"\"\", rad_data.iloc[i]['text'].strip().lower()))\n", + " display(HTML(\"<>%s

\"%\" \".join(trigram_generator[bigram_generator[txt.tokens]])))\n", + " \n", + " except ValueError:\n", + " break\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "type(txt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wrangling Doesn't Always Do What You Want\n", + "\n", + ">technique : multiplanar_td and td-weighted_images of the brain with gadolinium_according to standard departmental protocol ." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/modules/m14_linear_algebra_text_processing/InClass/working_with_mimic2_radiology_reports.ipynb b/modules/m14_linear_algebra_text_processing/InClass/working_with_mimic2_radiology_reports.ipynb new file mode 100644 index 0000000..7c5c017 --- /dev/null +++ b/modules/m14_linear_algebra_text_processing/InClass/working_with_mimic2_radiology_reports.ipynb @@ -0,0 +1,373 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pymysql\n", + "import pandas as pd\n", + "import getpass\n", + "from textblob import TextBlob\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conn = pymysql.connect(host=\"mysql\",\n", + " port=3306,user=\"jovyan\",\n", + " 
passwd=getpass.getpass(\"Enter MySQL passwd for jovyan\"),db='mimic2')\n", + "cursor = conn.cursor()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use Pandas and SQL to create a dataframe with the following:\n", + "* subject_id\n", + "* hospital admission id\n", + "* text of the radiology report\n", + "* Limit the number of reports to 10000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data = \\\n", + "pd.read_sql(\"\"\"SELECT noteevents.subject_id, \n", + " noteevents.hadm_id,\n", + " noteevents.text \n", + " FROM noteevents\n", + " WHERE noteevents.category = 'RADIOLOGY_REPORT' LIMIT 10000\"\"\",conn)\n", + "rad_data.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write a function that returns the impression section of a report\n", + "\n", + "#### Hints\n", + "\n", + "* Not every report will have an impression section\n", + "* \"INTERPRETATION\" and \"CONCLUSIONS\" might be synonyms for \"IMPRESSION\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_impression(report):\n", + " pass\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### In how many reports did we find an impression section?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def count_impression(report):\n", + " pass\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sum([count_impression(report) for report in rad_data[\"text\"]])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Regular expressions for data cleansing\n", + "\n", + "* Write a regular expression to replace dates in the reports with ``[**DATE**]``\n", + "* Write a regular expression to replace times in the reports with ``[**TIME**]``\n", + "* Write a regular expression to replaces digits with \"d\", (e.g. \"43 cc\" would become \"dd cm\")\n", + "\n", + "#### Hints\n", + "\n", + "* Look at some sample reports to see what dates and times look like in the reports\n", + "* What order would you need to apply the regular expressions?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in range(10):\n", + " print(rad_data.iloc[i]['text'])\n", + " print(\"*\"*42,\"\\n\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "date = re.compile(r\"\"\"d\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(rad_data.iloc[0][\"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "date.findall(rad_data.iloc[0][\"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "digits = re.compile(r\"\"\"\\d\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(digits.sub(\"d\", rad_data.iloc[0][\"text\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data[\"impression\"] = \\\n", + "rad_data.apply(lambda row: digits.sub(\"d\", get_impression(row[\"text\"])).lower(), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### How many unique words occur in the corpus?\n", + "\n", + "#### Hint\n", + "\n", + "1. Use TextBlob\n", + "1. Put all the reports into a single string\n", + "\n", + "#### I got 8658" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "unique_impression_words = None\n", + "len(unique_impression_words)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gensim.parsing.preprocessing import STOPWORDS\n", + "STOPWORDS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "my_stop_words = frozenset([\"a\", \"am\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"for\", \"is\", \"the\", \"is\", \"of\", \"which\", ])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a single string with all the reports\n", + "\n", + "#### Hints, etc.\n", + "* Use List Comprehension\n", + "* Use string joins\n", + "* Iterate over the rows of the data frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define a vector space for the radiology corpus\n", + "\n", + "#### Hints\n", + "\n", + "1. How would you build a corpus from words only occuring more than N times?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a new column named `\"impression no stops\"` where [stop words](https://en.wikipedia.org/wiki/Stop_words) have been dropped from the impression\n", + "\n", + "* The gensim package has stop words defined (``from gensim.parsing.preprocessing import STOPWORDS``)\n", + "\n", + "#### Hints\n", + "1. Do you agree with dropping all the stop words?\n", + "1. How could we create a new stopwords frozen set absent the terms we want to keep (double negative?)\n", + "1. You could use a regular expression substitution or tokenize the report first and operate on the list of words." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data[\"impression no stops\"] = \\\n", + "rad_data.apply(None, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data.iloc[0][\"impression\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data.iloc[0][\"impression no stops\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "unique_impression_words = set(TextBlob(\" \".join(rad_data[\"impression no stops\"])).words)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "word_map = dict(zip(unique_impression_words,range(len(unique_impression_words))))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", +
"pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/modules/m14_linear_algebra_text_processing/ROADMAP.ipynb b/modules/m14_linear_algebra_text_processing/ROADMAP.ipynb index fcb1e10..97e6f6a 100644 --- a/modules/m14_linear_algebra_text_processing/ROADMAP.ipynb +++ b/modules/m14_linear_algebra_text_processing/ROADMAP.ipynb @@ -24,7 +24,7 @@ "\n", "* [Working with text reports](./InClass/working_with_mimic2_radiology_reports.ipynb)\n", "* [Identifying phrases](./InClass/recognizing_phrases.ipynb)\n", - "* [Vector spaces with Gensim](Corpora_and_Vector_Spaces.ipynb)" + "* Using nbconvert and [pep8](https://pypi.python.org/pypi/pep8)" ] }, {