From b891ae696a86c719a22a605e66ec753b0f03ee71 Mon Sep 17 00:00:00 2001 From: Cuong Tham Date: Wed, 6 Oct 2021 15:12:22 -0700 Subject: [PATCH] Update outdated code and documentation - Introduce fail-fast in various places to improve error reporting - Add a docker-compose-no-init.yml for running the project in discrete steps - Consolidate README.md - Update URL for RankyMcRankFace.jar --- .gitignore | 2 +- README.md | 21 +++++-- docker-compose-no-init.yml | 23 ++++++++ docker-compose.yml | 2 +- train/README.md | 116 ------------------------------------- train/loadFeatures.py | 4 +- train/movielens.py | 4 +- train/prepare.sh | 4 +- train/ratingsToES.py | 8 ++- train/train.py | 11 ++-- 10 files changed, 59 insertions(+), 136 deletions(-) create mode 100644 docker-compose-no-init.yml delete mode 100644 train/README.md diff --git a/.gitignore b/.gitignore index c247c58..d135d5a 100644 --- a/.gitignore +++ b/.gitignore @@ -4,5 +4,5 @@ model.txt movie_judgments_wfeatures.*.txt movie_judgments_wfeatur*.txt tmdb.json -train/RankyMcRankFace-0.1.1.jar +train/RankyMcRankFace-*.jar train/*.pyc diff --git a/README.md b/README.md index 8dd206b..22af558 100644 --- a/README.md +++ b/README.md @@ -2,24 +2,30 @@ This demo uses data from [TheMovieDB](http://themoviedb.org) (TMDB) to demonstrate using [Ranklib](https://sourceforge.net/p/lemur/wiki/RankLib/) learning to rank models with Elasticsearch. -You can go through the individual steps, or if you want to just skip to the end, you can use Docker: +# Run Everything in One Step ``` docker-compose up ``` -And browse to http://localhost:8000 +If project files have been modified after the initial run, do this to update the docker images: +``` +docker-compose build +docker-compose up +``` +And browse to http://localhost:8000 -# Install Dependencies and prep data... +# Run Each Step One by One This demo requires - Python 3+ - Python `elasticsearch` and `requests` libraries +## Install Dependencies ``` -pip3 install requests elasticsearch5 parse jinja +pip3 install requests elasticsearch5 parse jinja2 ``` ## Download the TMDB Data & Ranklib Jar @@ -36,7 +42,12 @@ cd train Start a supported version of Elasticsearch and follow the [instructions to install](https://github.com/o19s/elasticsearch-learning-to-rank#installing) the learning to rank plugin. ``` -docker run -d -p 9201:9200 -p 9301:9300 -e "discovery.type=single-node" --name elasticsearch5 elasticsearch:5.6.4 +docker-compose -f docker-compose-no-init.yml up +``` +## Populate Ratings + +``` +python /train/ratingsToES.py http://elasticsearch:9200 ``` ## Index to Elasticsearch diff --git a/docker-compose-no-init.yml b/docker-compose-no-init.yml new file mode 100644 index 0000000..d8eb834 --- /dev/null +++ b/docker-compose-no-init.yml @@ -0,0 +1,23 @@ +# Docker compose file for the application. + +#version: '2' # on CircleCI the version and services cause build to puke. +#services: + + app: + build: . + dockerfile: ./deploy/app/Dockerfile + environment: + - ELASTICSEARCH_URL=http://localhost:9200 + links: + - elasticsearch + ports: + - "8000:80" + + elasticsearch: + build: . + dockerfile: ./deploy/elasticsearch/Dockerfile + environment: + - "ES_JAVA_OPTS=-Xms512m -Xmx512m" + ports: + - "9200:9200" + - "9300:9300" diff --git a/docker-compose.yml b/docker-compose.yml index 505fa53..096ce87 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -11,7 +11,7 @@ links: - elasticsearch ports: - - "80:80" + - "8000:80" elasticsearch: build: . 
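Before stepping through the discrete steps that the README above now describes, it can help to confirm that the Elasticsearch container started by docker-compose-no-init.yml is actually reachable. The sketch below is illustrative only and is not part of this patch; it assumes the localhost:9200 port mapping from the compose file (use http://elasticsearch:9200 instead when running from inside the app container).

```python
# Illustrative pre-flight check, not part of this patch.
# Assumes Elasticsearch is exposed on localhost:9200 per docker-compose-no-init.yml;
# from inside the app container the host would be http://elasticsearch:9200.
import requests

es_url = "http://localhost:9200"
resp = requests.get(es_url)
if resp.status_code != 200:
    raise Exception("Elasticsearch not reachable at %s (HTTP %s)" % (es_url, resp.status_code))
print("Elasticsearch %s is up" % resp.json()["version"]["number"])
```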
diff --git a/train/README.md b/train/README.md
deleted file mode 100644
index dec9488..0000000
--- a/train/README.md
+++ /dev/null
@@ -1,116 +0,0 @@
-# Learning to Rank Demo
-
-This demo uses data from [TheMovieDB](http://themoviedb.org) (TMDB) to demonstrate using [Ranklib](https://sourceforge.net/p/lemur/wiki/RankLib/) learning to rank models with Elasticsearch.
-
-# Install Dependencies and prep data...
-
-This demo requires
-
-- Python 3+
-- Python `elasticsearch` and `requests` libraries
-
-## Download the TMDB Data & Ranklib Jar
-
-The first time you run this demo, fetch RankLib.jar (used to train the model) and tmdb.json (the dataset used)
-
-```
-python prepare.py
-```
-
-## Start Elasticsearch/install plugin
-
-Start a supported version of Elasticsearch and follow the [instructions to install](https://github.com/o19s/elasticsearch-learning-to-rank#installing) the learning to rank plugin.
-
-## Index to Elasticsearch
-
-This script will create a 'tmdb' index with default/simple mappings. You can edit this file to play with mappings.
-
-```
-python indexMlTmdb.py
-```
-
-# Onto the machine learning...
-
-## TLDR
-
-If you're actually going to build a learning to rank system, read past this section. But to sum up, the full Movie demo can be run by
-
-```
-python train.py
-```
-
-Then you can search using
-
-```
-python search.py Rambo
-```
-
-and search results will be printed to the console.
-
-More on how all this actually works below:
-
-## Create and upload features (loadFeatures.py)
-
-A "feature" in ES LTR corresponds to an Elasticsearch query. The score yielded by the query is used to train and evaluate the model. For example, if you feel that a TF\*IDF title score corresponds to higher relevance, then that's a feature you'd want to train on! Other features might include how old a movie is, the number of keywords in a query, or whatever else you suspect might correlate to your user's sense of relevance.
-
-If you examine [loadFeatures.py](loadFeatures.py) you'll see how we create features. We first initialize the default feature store (`PUT /_ltr`). We create a feature set (`POST /_ltr/_featureset/movie_features`). Now we have a place to create features for both logging & use by our models!
-
-In the demo, features 1...n.json are mustache templates that correspond to the features. In this case, the features are identified by *ordinal* (feature 1 is in 1.json). They are uploaded to Elasticsearch Learning to Rank with these ordinals as the feature name. In `eachFeature`, you'll see a loop where we access each mustache template on the file system and return a JSON body for adding the feature to Elasticsearch.
-
-For traditional Ranklib models, the ordinal is the only way features are identified. Other models use feature *names* which make developing, logging, and managing features more maintainable.
-
-## Gather Judgments (sample_judgments.txt)
-
-The first part of the training data is the *judgment list*. We've provided one in [sample_judgments.txt](sample_judgments.txt).
-
-What's a judgment list? A judgment list tells us how relevant a document is for a search query. In other words, a three-tuple of
-
-```
-grade,docId,keywords
-```
-
-Quality comes in the form of *grades*. For example, if the movie "First Blood" is considered extremely relevant for the query Rambo, we give it a grade of 4 ('exactly relevant'). The movie Bambi would receive a '0'.
-
-Instead of the notional CSV format above, Ranklib and other learning to rank systems use a format from LibSVM, shown below:
-
-```
-# qid:1: rambo
-#
-#
-# grade (0-4) queryid # docId title
-4 qid:1 # 7555 Rambo
-```
-
-You'll notice we bastardize this syntax to add comments identifying the keywords associated with each query id, and append metadata to each line. Code provided in [judgments.py](judgments.py) handles this syntax.
-
-## Log features (collectFeatures.py)
-
-You saw above how we created features; the next step is to log features for each judgment 3-tuple. This code is in [collectFeatures.py](collectFeatures.py). Logging features can be done in several different contexts. Of course, in a production system, you may wish to log features as users search. In other contexts, you may have a hand-created judgment list (as we do) and wish to simply ask Elasticsearch Learning to Rank for feature values for query/document pairs.
-
-In [collectFeatures.py](collectFeatures.py), you'll see an `sltr` query is included. This query points to a featureSet, not a model, so it does not influence the score. We filter down to the needed document ids for each keyword and allow this `sltr` query to run.
-
-You'll also notice an `ext` component in the request. This search extension is part of the Elasticsearch Learning to Rank plugin and allows you to configure feature logging. You'll notice it refers to the query name `sltr`, allowing it to pluck out the `sltr` query and perform logging associated with the feature set.
-
-Once features are gathered, the judgment list is fleshed out with feature values; the ordinals below correspond to the features in our 1..n.json files.
-
-```
-4 qid:1 1:12.318446 2:9.8376875 # 7555 rambo
-```
-
-## Train (train.py and RankLib.jar)
-
-With training data in place, it's time to ask RankLib to train a model and save it to a file. RankLib supports linear models, ListNet, and several tree-based models such as LambdaMART. In [train.py](train.py) you'll notice how RankLib is called with command line arguments. Models `test_N` are created in our feature store for each type of RankLib model. In the `saveModel` function, you can see how the model is uploaded to our "movie_features" feature set.
-
-## Search using the model (search.py)
-
-See what sort of search results you get! In `search.py` you'll see we execute the `sltr` query referring to a `test_N` model in the rescore phase. By default `test_6` is used (corresponding to LambdaMART), but you can select a different model at the command line.
-
-Search with default LambdaMART:
-
-```
-python search.py rambo
-```
-
-Try a different model:
-
-```
-python search.py rambo test_8
-```
diff --git a/train/loadFeatures.py b/train/loadFeatures.py
index 9165c86..eadde81 100644
--- a/train/loadFeatures.py
+++ b/train/loadFeatures.py
@@ -34,7 +34,9 @@ def loadFeatures(esHost, featureSetName='movie_features'):
     fullPath = urljoin(esHost, path)
     print("POST %s" % fullPath)
     print(json.dumps(featureSet, indent=2))
-    resp = requests.post(fullPath, json.dumps(featureSet))
+    resp = requests.post(fullPath, json.dumps(featureSet), headers={'Content-Type': 'application/json'})
+    if resp.status_code != 201:
+        raise Exception('POST %s did not return 201, got %s' % (resp.url, resp.status_code))
     print("%s" % resp.status_code)
     print("%s" % resp.text)
 
diff --git a/train/movielens.py b/train/movielens.py
index 1a0a5ff..d0d54bb 100644
--- a/train/movielens.py
+++ b/train/movielens.py
@@ -1,4 +1,3 @@
-
 import json
 
 from elasticsearch5 import Elasticsearch
@@ -51,9 +50,8 @@ def getExpansions(es, mlensIds, minDocCount=1, expandField='liked_movies.keyword
 
 
 def expansionMlens(es, keywords):
-    esMlens = Elasticsearch('http://elasticsearch:9200', timeout=1000)
     topMlens = getTopMlensIds(es, keywords=keywords, searchField="title", index="tmdb")
-    return getExpansions(es=esMlens, mlensIds=topMlens, expandField="liked_movies.keyword", shardSize=10)
+    return getExpansions(es=es, mlensIds=topMlens, expandField="liked_movies.keyword", shardSize=10)
 
 
 if __name__ == "__main__":
diff --git a/train/prepare.sh b/train/prepare.sh
index bc14a7b..c6550a3 100755
--- a/train/prepare.sh
+++ b/train/prepare.sh
@@ -1,5 +1,5 @@
-#!/bin/bash
-wget https://dl.bintray.com/o19s/RankyMcRankFace/com/o19s/RankyMcRankFace/0.1.1/RankyMcRankFace-0.1.1.jar
+#!/bin/bash -e
+wget https://github.com/o19s/RankyMcRankFace/releases/download/0.1.0/RankyMcRankFace-0.1.0.jar
 wget http://es-learn-to-rank.labs.o19s.com/tmdb.json
 wget http://files.grouplens.org/datasets/movielens/ml-20m.zip
 unzip ml-20m.zip
diff --git a/train/ratingsToES.py b/train/ratingsToES.py
index 3cd622a..0646adb 100644
--- a/train/ratingsToES.py
+++ b/train/ratingsToES.py
@@ -51,8 +51,15 @@ def indexToElastic(es):
 
 
 if __name__ == "__main__":
     from sys import argv
-    es_url = argv[1]
+    import configparser
+
+    if len(argv) > 1:
+        es_url = argv[1]
+    else:
+        config = configparser.ConfigParser()
+        config.read('settings.cfg')
+        es_url = config['DEFAULT']['ESHost']
 
     es = Elasticsearch(es_url)
     indexToElastic(es)
diff --git a/train/train.py b/train/train.py
index 03c8a53..acd088c 100644
--- a/train/train.py
+++ b/train/train.py
@@ -11,12 +11,13 @@ def trainModel(trainingData, testData, modelOutput, whichModel=8):
     # - each is trained against a proportion of the training data (-srate)
     # - each is trained using a subset of the features (-frate)
     # - each can be either a MART or LambdaMART model (-rtype 6 lambda mart)
-    cmd = "java -jar RankyMcRankFace-0.1.1.jar -metric2t NDCG@10 -bag 10 -srate 0.6 -frate 0.6 -rtype 6 -shrinkage 0.1 -tree 80 -ranker %s -train %s -test %s -save %s -feature features.txt" % (whichModel, trainingData, testData, modelOutput)
+    cmd = "java -jar RankyMcRankFace-0.1.0.jar -metric2t NDCG@10 -bag 10 -srate 0.6 -frate 0.6 -rtype 6 -shrinkage 0.1 -tree 80 -ranker %s -train %s -test %s -save %s -feature features.txt" % (whichModel, trainingData, testData, modelOutput)
     print("*********************************************************************")
     print("*********************************************************************")
     print("Running %s" % cmd)
-    os.system(cmd)
-    pass
+    r = os.system(cmd)
+    if r != 0:
+        raise Exception('Unable to execute command %s (exit status %s)' % (cmd, r))
 
 
 def partitionJudgments(judgments, testProportion=0.1):
@@ -54,7 +55,7 @@ def saveModel(esHost, scriptName, featureSet, modelFname):
     path = "_ltr/_clearcache"
     fullPath = urljoin(esHost, path)
     print("POST %s" % fullPath)
-    resp = requests.post(fullPath)
+    resp = requests.post(fullPath, headers={'Content-Type': 'application/json'})
     if (resp.status_code >= 300):
         print(resp.text)
 
@@ -64,7 +65,7 @@ def saveModel(esHost, scriptName, featureSet, modelFname):
     fullPath = urljoin(esHost, path)
     modelPayload['model']['model']['definition'] = modelContent
     print("POST %s" % fullPath)
-    resp = requests.post(fullPath, json.dumps(modelPayload))
+    resp = requests.post(fullPath, json.dumps(modelPayload), headers={'Content-Type': 'application/json'})
     print(resp.status_code)
     if (resp.status_code >= 300):
         print(resp.text)
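The Content-Type headers and status-code checks added above repeat the same fail-fast pattern in loadFeatures.py and train.py. A small helper along these lines could consolidate it; this is only an illustrative sketch, and `post_json` is a hypothetical name, not a function in this repository.

```python
# Illustrative sketch, not part of this patch: the fail-fast pattern used for
# the Elasticsearch POSTs above, factored into a single hypothetical helper.
import json
import requests

def post_json(url, payload=None):
    """POST an optional JSON payload and raise if Elasticsearch rejects it."""
    body = json.dumps(payload) if payload is not None else None
    resp = requests.post(url, data=body, headers={'Content-Type': 'application/json'})
    if resp.status_code >= 300:
        raise Exception('POST %s failed with HTTP %s: %s' % (url, resp.status_code, resp.text))
    return resp
```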