Update outdated code and documentation

- Introduce fail-fast in various places to improve error reporting
- Add a docker-compose-no-init.yml for running the project in discrete steps
- Consolidate README.md
- Update URL for RankyMcRankFace.jar
o19s · Oct 7, 2021 · b891ae6 · b891ae6
1 parent 64c0474
commit b891ae6
Show file tree

Hide file tree

Showing 10 changed files with 59 additions and 136 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,5 +4,5 @@ model.txt
 movie_judgments_wfeatures.*.txt
 movie_judgments_wfeatur*.txt
 tmdb.json
-train/RankyMcRankFace-0.1.1.jar
+train/RankyMcRankFace-*.jar
 train/*.pyc
diff --git a/README.md b/README.md
@@ -2,24 +2,30 @@
 
 This demo uses data from [TheMovieDB](http://themoviedb.org) (TMDB) to demonstrate using [Ranklib](https://sourceforge.net/p/lemur/wiki/RankLib/) learning to rank models with Elasticsearch.
 
-You can go through the individual steps, or if you want to just skip to the end, you can use Docker:
+# Run Everything in One Step
 
 ```
 docker-compose up
 ```
 
-And browse to http://localhost:8000
+If project files have been modified since the initial run, rebuild the Docker images before starting the containers again:
+```
+docker-compose build
+docker-compose up
+```
 
+And browse to http://localhost:8000
 
-# Install Dependencies and prep data...
+# Run Each Step One by One
 
 This demo requires
 
 - Python 3+
 - Python `elasticsearch` and `requests` libraries
 
+## Install Dependencies
 ```
-pip3 install requests elasticsearch5 parse jinja
+pip3 install requests elasticsearch5 parse jinja2
 ```
 
 ## Download the TMDB Data & Ranklib Jar
@@ -36,7 +42,12 @@ cd train
 Start a supported version of Elasticsearch and follow the [instructions to install](https://github.com/o19s/elasticsearch-learning-to-rank#installing) the learning to rank plugin.
 
 ```
-docker run -d -p 9201:9200 -p 9301:9300 -e "discovery.type=single-node" --name elasticsearch5 elasticsearch:5.6.4
+docker-compose -f docker-compose-no-init.yml up
+```
+## Populate Ratings
+
+```
+python /train/ratingsToES.py http://elasticsearch:9200
 ```
 
 ## Index to Elasticsearch

diff --git a/docker-compose-no-init.yml b/docker-compose-no-init.yml
@@ -0,0 +1,23 @@
+# Docker compose file for the application.
+
+#version: '2'   # on CircleCI the `version` and `services` keys cause the build to fail.
+#services:
+
+  app:
+    build: .
+    dockerfile: ./deploy/app/Dockerfile
+    environment:
+     - ELASTICSEARCH_URL=http://localhost:9200
+    links:
+     - elasticsearch
+    ports:
+     - "8000:80"
+
+  elasticsearch:
+    build: .
+    dockerfile: ./deploy/elasticsearch/Dockerfile
+    environment:
+      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
+    ports:
+     - "9200:9200"
+     - "9300:9300"
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -11,7 +11,7 @@
     links:
      - elasticsearch
     ports:
-     - "80:80"
+     - "8000:80"
 
   elasticsearch:
     build: .

diff --git a/train/README.md b/train/README.md
diff --git a/train/loadFeatures.py b/train/loadFeatures.py
@@ -34,7 +34,9 @@ def loadFeatures(esHost, featureSetName='movie_features'):
     fullPath = urljoin(esHost, path)
     print("POST %s" % fullPath)
     print(json.dumps(featureSet, indent=2))
-    resp = requests.post(fullPath, json.dumps(featureSet))
+    resp = requests.post(fullPath, json.dumps(featureSet), headers={'Content-Type': 'application/json'})
+    if resp.status_code != 201:
+        raise Exception('Posting to %s is not returning 201, got %s' % (resp.url, resp.status_code))
     print("%s" % resp.status_code)
     print("%s" % resp.text)
 

diff --git a/train/movielens.py b/train/movielens.py
@@ -1,4 +1,3 @@
-
 import json
 from elasticsearch5 import Elasticsearch
 
@@ -51,9 +50,8 @@ def getExpansions(es, mlensIds, minDocCount=1, expandField='liked_movies.keyword
 
 
 def expansionMlens(es, keywords):
-    esMlens = Elasticsearch('http://elasticsearch:9200', timeout=1000)
     topMlens = getTopMlensIds(es, keywords=keywords, searchField="title", index="tmdb")
-    return getExpansions(es=esMlens, mlensIds=topMlens, expandField="liked_movies.keyword", shardSize=10)
+    return getExpansions(es=es, mlensIds=topMlens, expandField="liked_movies.keyword", shardSize=10)
 
 
 if __name__ == "__main__":

diff --git a/train/prepare.sh b/train/prepare.sh
@@ -1,5 +1,5 @@
-#!/bin/bash
-wget https://dl.bintray.com/o19s/RankyMcRankFace/com/o19s/RankyMcRankFace/0.1.1/RankyMcRankFace-0.1.1.jar
+#!/bin/bash -e
+wget https://github.com/o19s/RankyMcRankFace/releases/download/0.1.0/RankyMcRankFace-0.1.0.jar
 wget http://es-learn-to-rank.labs.o19s.com/tmdb.json
 wget http://files.grouplens.org/datasets/movielens/ml-20m.zip
 unzip ml-20m.zip
diff --git a/train/ratingsToES.py b/train/ratingsToES.py
@@ -51,8 +51,12 @@ def indexToElastic(es):
 
 
 if __name__ == "__main__":
-    from sys import argv
-    es_url = argv[1]
+    if len(argv) > 1:
+        es_url = argv[1]
+    else:
+        config = configparser.ConfigParser()
+        config.read('settings.cfg')
+        es_url = config['DEFAULT']['ESHost']
 
     es = Elasticsearch(es_url)
     indexToElastic(es)

diff --git a/train/train.py b/train/train.py
@@ -11,12 +11,13 @@ def trainModel(trainingData, testData, modelOutput, whichModel=8):
     #  - each is trained against a proportion of the training data (-srate)
     #  - each is trained using a subset of the features (-frate)
     #  - each can be either a MART or LambdaMART model (-rtype 6 lambda mart)
-    cmd = "java -jar RankyMcRankFace-0.1.1.jar -metric2t NDCG@10 -bag 10 -srate 0.6 -frate 0.6 -rtype 6 -shrinkage 0.1 -tree 80 -ranker %s -train %s -test %s -save %s -feature features.txt" % (whichModel, trainingData, testData, modelOutput)
+    cmd = "java -jar RankyMcRankFace-0.1.0.jar -metric2t NDCG@10 -bag 10 -srate 0.6 -frate 0.6 -rtype 6 -shrinkage 0.1 -tree 80 -ranker %s -train %s -test %s -save %s -feature features.txt" % (whichModel, trainingData, testData, modelOutput)
     print("*********************************************************************")
     print("*********************************************************************")
     print("Running %s" % cmd)
-    os.system(cmd)
-    pass
+    r = os.system(cmd)
+    if r != 0:
+        raise Exception('Unable to execute command cmd %s' % cmd)
 
 
 def partitionJudgments(judgments, testProportion=0.1):
@@ -54,7 +55,7 @@ def saveModel(esHost, scriptName, featureSet, modelFname):
     path = "_ltr/_clearcache"
     fullPath = urljoin(esHost, path)
     print("POST %s" % fullPath)
-    resp = requests.post(fullPath)
+    resp = requests.post(fullPath, headers={'Content-Type': 'application/json'})
     if (resp.status_code >= 300):
         print(resp.text)
 
@@ -64,7 +65,7 @@ def saveModel(esHost, scriptName, featureSet, modelFname):
         fullPath = urljoin(esHost, path)
         modelPayload['model']['model']['definition'] = modelContent
         print("POST %s" % fullPath)
-        resp = requests.post(fullPath, json.dumps(modelPayload))
+        resp = requests.post(fullPath, json.dumps(modelPayload), headers={'Content-Type': 'application/json'})
         print(resp.status_code)
         if (resp.status_code >= 300):
             print(resp.text)