From 1c4539b9df1e5369899d346e990606954312fd3c Mon Sep 17 00:00:00 2001 From: Brian Chapman Date: Wed, 15 Nov 2017 14:17:06 -0700 Subject: [PATCH] added m14 inclass materials --- .../InClass/recognizing_phrases.ipynb | 605 ++++++++++++++++++ ...orking_with_mimic2_radiology_reports.ipynb | 373 +++++++++++ .../ROADMAP.ipynb | 2 +- 3 files changed, 979 insertions(+), 1 deletion(-) create mode 100644 modules/m14_linear_algebra_text_processing/InClass/recognizing_phrases.ipynb create mode 100644 modules/m14_linear_algebra_text_processing/InClass/working_with_mimic2_radiology_reports.ipynb diff --git a/modules/m14_linear_algebra_text_processing/InClass/recognizing_phrases.ipynb b/modules/m14_linear_algebra_text_processing/InClass/recognizing_phrases.ipynb new file mode 100644 index 0000000..27fc194 --- /dev/null +++ b/modules/m14_linear_algebra_text_processing/InClass/recognizing_phrases.ipynb @@ -0,0 +1,605 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Phrases\n", + "\n", + "So far we have only thought in terms of single words: \"lower\", \"lobe\", \"University\", \"of\", \"Utah\". But in reality oftentimes multiple words form one unit of thought: \"University of Utah\". Our word vectors will do a better job of representing our text if we first recognize these phrases. We are going to use the [gensim](https://radimrehurek.com/gensim/models/phrases.html) package to detect and transform these phrases.\n", + "\n", + "For example, the sentence, \"I am a faculty member in the departments of Biomedical Informatics and Radiology and Imaging Sciences at the University of Utah.\" would be transformed to \"I am a faculty member in the departments of Biomedical_Informatics and Radiology_and_Imaging_Sciences at the University_of_Utah.\"\n", + "\n", + "\"Biomedical_Informatics\" is an example of a **bigram phrase** and \"University_of_Utah\" is a **trigram phrase**.
I guess \"Radiology_and_Imaging_Sciences\" is a quadgram phrase, but we will likely not try to detect phrases that long." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using the Gensim Phrases Module" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nose.tools import assert_almost_equal, assert_true, assert_equal, assert_raises\n", + "from numbers import Number" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Upgrade to the latest version of gensim" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!conda install gensim -y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pymysql\n", + "import pandas as pd\n", + "import getpass\n", + "from textblob import TextBlob\n", + "import re\n", + "from gensim.models.phrases import Phraser, Phrases\n", + "from IPython.display import clear_output, display, HTML\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gensim\n", + "gensim.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conn = pymysql.connect(host=\"mysql\",\n", + " port=3306,user=\"jovyan\",\n", + " passwd=getpass.getpass(\"Enter MySQL passwd for jovyan\"),db='mimic2')\n", + "cursor = conn.cursor()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Select Some Text from the MIMIC2 Database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data = \\\n", + "pd.read_sql(\"\"\"SELECT 
noteevents.subject_id, \n", + " noteevents.hadm_id,\n", + " noteevents.text \n", + " FROM noteevents\n", + " WHERE noteevents.category = 'RADIOLOGY_REPORT' LIMIT 5000\"\"\",conn)\n", + "rad_data.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Regular expressions for data cleansing\n", + "\n", + "* Write a regular expression to replace dates in the reports with ``[**DATE**]``\n", + "* Write a regular expression to replace times in the reports with ``[**TIME**]``\n", + "* Write a regular expression to replace digits with \"d\", (e.g. \"43 cc\" would become \"dd cc\")\n", + "\n", + "#### Hints, etc.\n", + "\n", + "* Look at some sample reports to see what dates and times look like in the reports\n", + "* What order would you need to apply the regular expressions?\n", + "* Could we just use the digit recognizer and skip the date and time strippers?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rd = re.compile(r\"\"\"\\d\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write a function to pre-process our text\n", + "\n", + "* Lower case?\n", + "* Digits?\n", + "* Strip dates/times?"
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### But first, write unit tests to test whether `preprocess` is functioning correctly\n", + "#### Then write functionality to pass tests\n", + "\n", + "You might want to use the `strings` module" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import string\n", + "string.ascii_uppercase" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess(txt):\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert_true???" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert_equal???" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert_raises???" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a TextBlob from all the text in `rad_data[\"text\"]`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "blob = TextBlob(preprocess(\" \".join(rad_data[\"text\"])))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Write a function `train_phrases` that will train bigram and trigram detectors\n", + "\n", + "* We want to be able to ignore common terms in our phrase detection\n", + "* We want to be able to specify the minimum number of occurences in our text to be considered a phrase\n", + "* Return a dictionary of detectors" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write unit tests to determine whether `train_phrases` is working as expected" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def train_phrases(blob, common_terms=None, min_count=5):\n", + " pass\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "common_terms = [\"of\", \"with\", \"without\", \"and\", \"or\", \"the\", \"a\"]\n", + "generators = train_phrases(blob, common_terms=common_terms, min_count=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write a function that takes a `TextBlob` instance and phrase generators and returns a string of text\n", + "#### Unit tests first" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_phrased_text(blob, generators):\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(found_phrases)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What phrases did we detect?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "found_phrases = set([w for w in phrased_txt.split() if \"_\" in w])\n", + "print(len(found_phrases))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### How often did each phrase occur?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import ???" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "counted_phrases = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def sorted_counter(cntr):\n", + " lcntr = list(cntr.items())\n", + " lcntr.sort(key=lambda x:x[1], reverse=True)\n", + " return lcntr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lcounted_phrases = sorted_counter(counted_phrases)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for phrase, count in lcounted_phrases:\n", + " print(\"%s\\t%03d\"%(phrase.ljust(40),count))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a word vector vocabulary using only words and phrases that occur more than N times\n", + "### How to choose N?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What is our vocabulary from phrased_txt (how many unique words)?\n", + "\n", + "Why use `TextBlob.words` instead of just `phrased_txt.split()`?\n", + "\n", + "#### Why is `phrased_blob = TextBlob(phrased_txt)` fast and `print(len(set(phrased_blob.words)))` slow?"
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "phrased_blob = TextBlob(phrased_txt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(len(set(phrased_blob.words)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "phrased_blob_count = None\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "phrased_blob_count[:100]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Based on these most frequent words, create a list of stop words to drop from our vocabulary" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stop_words = []" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What are our infrequent words?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "phrased_blob_count[-2000:-1000]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.distplot([c[1] for c in phrased_blob_count if c[1] > 500])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len([w for w in phrased_blob_count if w[1]>10])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vwords = [w for w in phrased_blob_count if w[1]>0 and w[0] not in stop_words]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vocabulary = {}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Determining Similarity Between Reports\n", + "* CXR vs CT vs MR" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data[rad_data[\"text\"].str.contains(\"MRI\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a Report Browser" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_reports = rad_data.shape[0]\n", + "while True:\n", + " try:\n", + " i = int(input(\"Enter a number between 0 and %d. otherwise to quit\"%num_reports))\n", + " clear_output()\n", + "\n", + " if i < 0 or i >=num_reports:\n", + " break\n", + " txt = TextBlob(rd.sub(\"\"\"d\"\"\", rad_data.iloc[i]['text'].strip().lower()))\n", + " display(HTML(\"<>%s

\"%\" \".join(trigram_generator[bigram_generator[txt.tokens]])))\n", + " \n", + " except ValueError:\n", + " break\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "type(txt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wrangling Doesn't Always Do What You Want\n", + "\n", + ">technique : multiplanar_td and td-weighted_images of the brain with gadolinium_according to standard departmental protocol ." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/modules/m14_linear_algebra_text_processing/InClass/working_with_mimic2_radiology_reports.ipynb b/modules/m14_linear_algebra_text_processing/InClass/working_with_mimic2_radiology_reports.ipynb new file mode 100644 index 0000000..7c5c017 --- /dev/null +++ b/modules/m14_linear_algebra_text_processing/InClass/working_with_mimic2_radiology_reports.ipynb @@ -0,0 +1,373 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pymysql\n", + "import pandas as pd\n", + "import getpass\n", + "from textblob import TextBlob\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conn = pymysql.connect(host=\"mysql\",\n", + " port=3306,user=\"jovyan\",\n", + " 
passwd=getpass.getpass(\"Enter MySQL passwd for jovyan\"),db='mimic2')\n", + "cursor = conn.cursor()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use Pandas and SQL to create a dataframe with the following:\n", + "* subject_id\n", + "* hospital admission id\n", + "* text of the radiology report\n", + "* Limit the number of reports to 10000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data = \\\n", + "pd.read_sql(\"\"\"SELECT noteevents.subject_id, \n", + " noteevents.hadm_id,\n", + " noteevents.text \n", + " FROM noteevents\n", + " WHERE noteevents.category = 'RADIOLOGY_REPORT' LIMIT 10000\"\"\",conn)\n", + "rad_data.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write a function that returns the impression section of a report\n", + "\n", + "#### Hints\n", + "\n", + "* Not every report will have an impression section\n", + "* \"INTERPRETATION\" and \"CONCLUSIONS\" might be synonyms for \"IMPRESSION\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_impression(report):\n", + " pass\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### In how many reports did we find an impression section?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def count_impression(report):\n", + " pass\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sum([count_impression(report) for report in rad_data[\"text\"]])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Regular expressions for data cleansing\n", + "\n", + "* Write a regular expression to replace dates in the reports with ``[**DATE**]``\n", + "* Write a regular expression to replace times in the reports with ``[**TIME**]``\n", + "* Write a regular expression to replaces digits with \"d\", (e.g. \"43 cc\" would become \"dd cm\")\n", + "\n", + "#### Hints\n", + "\n", + "* Look at some sample reports to see what dates and times look like in the reports\n", + "* What order would you need to apply the regular expressions?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in range(10):\n", + " print(rad_data.iloc[i]['text'])\n", + " print(\"*\"*42,\"\\n\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "date = re.compile(r\"\"\"d\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(rad_data.iloc[0][\"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "date.findall(rad_data.iloc[0][\"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "digits = re.compile(r\"\"\"\\d\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(digits.sub(\"d\", rad_data.iloc[0][\"text\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data[\"impression\"] = \\\n", + "rad_data.apply(lambda row: digits.sub(\"d\", get_impression(row[\"text\"])).lower(), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### How many unique words occur in the corpus?\n", + "\n", + "#### Hint\n", + "\n", + "1. Use TextBlob\n", + "1. Put all the reports into a single string\n", + "\n", + "#### I got 8658" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "unique_impression_words = None\n", + "len(unique_impression_words)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gensim.parsing.preprocessing import STOPWORDS\n", + "STOPWORDS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "my_stop_words = frozenset([\"a\", \"am\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"for\", \"is\", \"the\", \"is\", \"of\", \"which\", ])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a single string with all the reports\n", + "\n", + "#### Hints, etc.\n", + "* Use List Comprehension\n", + "* Use string joins\n", + "* Iterate over the rows of the data frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define a vector space for the radiology corpus\n", + "\n", + "#### Hints\n", + "\n", + "1. How would you build a corpus from words only occuring more than N times?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a new column named `\"impression no stops\"` where [stop words](https://en.wikipedia.org/wiki/Stop_words) have been dropped from the impression\n", + "\n", + "* The gensim package has stop words defined (``from gensim.parsing.preprocessing import STOPWORDS``)\n", + "\n", + "#### Hints\n", + "1. Do you agree with dropping all the stop words?\n", + "1. How could we create a new stopwords frozen set absent the terms we want to keep (double negative?)\n", + "1. You could use a regular expression substitution or tokenize the report first and operate on the list of words." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data[\"impression no stops\"] = \\\n", + "rad_data.apply(None, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data.iloc[0][\"impression\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rad_data.iloc[0][\"impression no stops\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "unique_impression_words = set(TextBlob(\" \".join(rad_data[\"impression no stops\"])).words)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "word_map = dict(zip(unique_impression_words,range(len(unique_impression_words))))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", +
"pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/modules/m14_linear_algebra_text_processing/ROADMAP.ipynb b/modules/m14_linear_algebra_text_processing/ROADMAP.ipynb index fcb1e10..97e6f6a 100644 --- a/modules/m14_linear_algebra_text_processing/ROADMAP.ipynb +++ b/modules/m14_linear_algebra_text_processing/ROADMAP.ipynb @@ -24,7 +24,7 @@ "\n", "* [Working with text reports](./InClass/working_with_mimic2_radiology_reports.ipynb)\n", "* [Identifying phrases](./InClass/recognizing_phrases.ipynb)\n", - "* [Vector spaces with Gensim](Corpora_and_Vector_Spaces.ipynb)" + "* Using nbconvert and [pep8](https://pypi.python.org/pypi/pep8)" ] }, {