swivel/eval.mk

# -*- Mode: Makefile -*-
#
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This makefile pulls down the evaluation datasets and formats them uniformly.
# Word similarity evaluations are formatted to contain exactly three columns:
# the two words being compared and the human judgement.
#
# Use wordsim.py and analogy to run the actual evaluations.

CXXFLAGS=-std=c++11 -m64 -mavx -g -Ofast -Wall
LDLIBS=-lpthread -lm

WORDSIM_EVALS=	ws353sim.ws.tab \
		ws353rel.ws.tab \
		men.ws.tab	\
		mturk.ws.tab \
		rarewords.ws.tab \
		simlex999.ws.tab \
		$(NULL)

ANALOGY_EVALS=	mikolov.an.tab \
		msr.an.tab \
		$(NULL)

all: $(WORDSIM_EVALS) $(ANALOGY_EVALS) analogy

ws353sim.ws.tab: ws353simrel.tar.gz
	tar Oxfz $^ wordsim353_sim_rel/wordsim_similarity_goldstandard.txt > $@

ws353rel.ws.tab: ws353simrel.tar.gz
	tar Oxfz $^ wordsim353_sim_rel/wordsim_relatedness_goldstandard.txt > $@

men.ws.tab: MEN.tar.gz
	tar Oxfz $^ MEN/MEN_dataset_natural_form_full | tr ' ' '\t' > $@

mturk.ws.tab: Mtruk.csv
	cat $^ | tr -d '\r' | tr ',' '\t' > $@

rarewords.ws.tab: rw.zip
	unzip -p $^ rw/rw.txt | cut -f1-3 -d $$'\t' > $@

simlex999.ws.tab: SimLex-999.zip
	unzip -p $^ SimLex-999/SimLex-999.txt \
	| tail -n +2 | cut -f1,2,4 -d $$'\t' > $@

mikolov.an.tab: questions-words.txt
	egrep -v -E '^:' $^ | tr '[A-Z] ' '[a-z]\t' > $@

msr.an.tab: myz_naacl13_test_set.tgz
	tar Oxfz $^ test_set/word_relationship.questions | tr ' ' '\t' > /tmp/q
	tar Oxfz $^ test_set/word_relationship.answers | cut -f2 -d ' ' > /tmp/a
	paste /tmp/q /tmp/a > $@
	rm -f /tmp/q /tmp/a


# wget commands to fetch the datasets.  Please see the original datasets for
# appropriate references if you use these.
ws353simrel.tar.gz:
	wget http://alfonseca.org/pubs/ws353simrel.tar.gz

MEN.tar.gz:
	wget http://clic.cimec.unitn.it/~elia.bruni/resources/MEN.tar.gz

Mtruk.csv:
	wget http://tx.technion.ac.il/~kirar/files/Mtruk.csv

rw.zip:
	wget http://www-nlp.stanford.edu/~lmthang/morphoNLM/rw.zip

SimLex-999.zip:
	wget http://www.cl.cam.ac.uk/~fh295/SimLex-999.zip

questions-words.txt:
	wget http://word2vec.googlecode.com/svn/trunk/questions-words.txt

myz_naacl13_test_set.tgz:
	wget http://research.microsoft.com/en-us/um/people/gzweig/Pubs/myz_naacl13_test_set.tgz

analogy: analogy.cc

clean:
	rm -f *.ws.tab *.an.tab analogy *.pyc

distclean: clean
	rm -f *.tgz *.tar.gz *.zip Mtruk.csv questions-words.txt