diff --git a/django_files/atlas/urls.py b/django_files/atlas/urls.py
index a06da876..35d173e2 100644
--- a/django_files/atlas/urls.py
+++ b/django_files/atlas/urls.py
@@ -4,6 +4,7 @@
 #from django.views.generic.simple import redirect_to
 from django.conf.urls import patterns, include
 from django.views.generic import TemplateView, RedirectView
+from django.views.generic.simple import direct_to_template
 
 # sitemap
 from django.conf.urls.defaults import *
@@ -125,8 +126,8 @@ def render_to_response(self, context, **kwargs):
     (r'^api/near/(?P\w{3})/(?P[0-9\.]+)/(?P\d+)/$', 'observatory.views_exhibit.api_near'),
 
-    (r'^api/search/$', 'observatory.views.api_search'),
-    (r'^search/$', 'observatory.views.search'),
+    (r'^api/search/$', 'observatory.views_search.api_search'),
+    (r'^search/$', direct_to_template, {'template': 'searchresults.html'}),
 
     # Overview (Countries) ######################################################
     (r'^country/(?P\w{2,3})/$', 'observatory.views_overview.country2'),
@@ -156,6 +157,8 @@ def render_to_response(self, context, **kwargs):
     url(r'^favicon\.ico$', RedirectView.as_view(url='/media/img/favicon.ico')),
     url(r'^sitemap\.xml$', RedirectView.as_view(url='/media/sitemaps/sitemap_index.xml')),
-    #(r'^sitemap\.xml$', 'django.contrib.sitemaps.views.sitemap', {'sitemaps': sitemaps}),
+    url(r'^opensearch.xml$', direct_to_template, {'template': 'opensearch.xml',
+                                                  'mimetype':
+                                                  'application/opensearchdescription+xml'}),
 )
diff --git a/django_files/observatory/helpers.py b/django_files/observatory/helpers.py
index 157d4e62..311fa2b9 100644
--- a/django_files/observatory/helpers.py
+++ b/django_files/observatory/helpers.py
@@ -1,7 +1,5 @@
 from observatory.models import Hs4_cpy, Sitc4_cpy, Country, Hs4, Sitc4
 
-import re
-
 
 # make sure app name is in the list of possible apps
 def get_app_name(app_name):
@@ -220,22 +218,3 @@ def params_to_url(api_name=None, app_name=None, country_codes=None,
         url += "%s/" % years
 
     return url
-
-year_expressions = [
-    re.compile(r'between (\d{4}) and (\d{4})', re.IGNORECASE),
-    re.compile(r'from (\d{4}) to (\d{4})', re.IGNORECASE),
-    re.compile(r'(\d{4}).*(\d{4})'),
-    re.compile(r'(?:in|at|during) (\d{4})', re.IGNORECASE),
-    re.compile(r'(\d{4})')
-]
-
-
-def extract_years(input_str):
-    """Extract things that look like years out of a given plaintext."""
-    results = (exp.search(input_str) for exp in year_expressions)
-    results = [result for result in results if result is not None]
-
-    if len(results) == 0:
-        return None, None
-    else:
-        return results[0].span(), results[0].groups()
diff --git a/django_files/observatory/management/commands/index_questions.py b/django_files/observatory/management/commands/index_questions.py
index aad12e87..964e871d 100644
--- a/django_files/observatory/management/commands/index_questions.py
+++ b/django_files/observatory/management/commands/index_questions.py
@@ -15,10 +15,17 @@ def handle(self, *args, **options):
         # TODO: "the" in country names
         trade_flows = ["import", "export"]
-        countries_flat = list(Country.objects.get_valid().only('name_en',
-                                                               'name_3char'))
+        countries_flat = list(Country.objects.get_valid()
+                              .only('name_en',
+                                    'name_3char',
+                                    'region__name',
+                                    ))
         countries = [[c] for c in countries_flat]
-        products = list(Hs4.objects.get_low_level().only('name_en', 'code'))
+        products = list(Hs4.objects.get_low_level()
+                        .only('name_en',
+                              'code',
+                              'community__name',
+                              ))
 
         # Which products are feasible for Latvia?
         casy_questions = self.generate_index_entries(['casy'], ['pie_scatter'],
@@ -135,16 +142,22 @@ def generate_index_entry(args):
         )
         index["url"] = url
 
+        regions = None
+        if args[2] is not None:
+            regions = [c.region.name for c in args[2]]
+
         # Add in params into elasticsearch in case we need them later
         kwargs = dict(
             api_name=args[0],
             app_name=args[1],
             country_names=country_names,
             country_codes=country_codes,
+            regions=regions,
             trade_flow=args[3],
             years=args[4],
             product_name=args[5].name_en if args[5] is not None else None,
-            product_code=args[5].code if args[5] is not None else None
+            product_code=args[5].code if args[5] is not None else None,
+            product_community=args[5].community.name if args[5] is not None else None,
         )
         kwargs = {k: v for k, v in kwargs.iteritems() if v is not None}
         index.update(kwargs)
diff --git a/django_files/observatory/tests.py b/django_files/observatory/tests.py
index 501deb77..414aeeb2 100644
--- a/django_files/observatory/tests.py
+++ b/django_files/observatory/tests.py
@@ -1,16 +1,74 @@
-"""
-This file demonstrates writing tests using the unittest module. These will pass
-when you run "manage.py test".
+from django.utils.unittest import TestCase
 
-Replace this with more appropriate tests for your application.
-"""
+from observatory.views_search import make_extractor, remove_spans
 
-from django.test import TestCase
 
+class TestRemoveSpans(TestCase):
 
-class SimpleTest(TestCase):
-    def test_basic_addition(self):
-        """
-        Tests that 1 + 1 always equals 2.
-        """
-        self.assertEqual(1 + 1, 2)
+    def test_one_span(self):
+        s = "I am a test string! Hahaaa! Narwhals!"
+        self.assertEquals("I am a string! Hahaaa! Narwhals!",
+                          remove_spans(s, [(6, 11)]))
+
+    def test_two_spans(self):
+        s = "I am a test string! Hahaaa! Narwhals!"
+        self.assertEquals("I am a string! Narwhals!",
+                          remove_spans(s, [(6, 11), (19, 27)]))
+
+    def test_empty_span(self):
+        s = "I am a test string! Hahaaa! Narwhals!"
+        self.assertEquals(s, remove_spans(s, []))
+
+    def test_consecutive_span(self):
+        s = "I am a test string! Hahaaa! Narwhals!"
+        self.assertEquals("I am a test string!",
+                          remove_spans(s, [(19, 27), (27, 37)]))
+
+    def test_one_gap_span(self):
+        s = "I am a test string! Hahaaa! Narwhals!"
+        self.assertEquals("I am a test string!!",
+                          remove_spans(s, [(19, 26), (27, 37)]))
+
+    def test_beginning_span(self):
+        s = "I am a test string! Hahaaa! Narwhals!"
+        self.assertEquals("a test string! Hahaaa! Narwhals!",
+                          remove_spans(s, [(0, 5)]))
+
+    def test_end_span(self):
+        s = "I am a test string! Hahaaa! Narwhals!"
+        self.assertEquals("I am a test string! Hahaaa!",
+                          remove_spans(s, [(27, 37)]))
+
+
+class SearchExtractorTest(TestCase):
+
+    def test_one_match_one_group(self):
+        string = "I have a cat."
+        extractor = make_extractor(r"(cat)")
+        result, processed_string = extractor(string)
+        self.assertEqual(processed_string, "I have a .")
+        self.assertEqual(result, [(("cat",), ((9, 12),))])
+
+    def test_two_match_one_group(self):
+        string = "I have a cat and another cat."
+
+        extractor = make_extractor(r"(cat|dog)")
+        result, processed_string = extractor(string)
+        self.assertEqual(processed_string, "I have a  and another .")
+        self.assertEqual(result, [(("cat",), ((9, 12),)),
+                                  (("cat",), ((25, 28),))])
+
+    def test_one_match_two_group(self):
+        string = "I have a cat and another dog."
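+        # The default mode removes the whole match (".*?" filler included);
+        # remove_only_matches=True removes only the parenthesized groups.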
+
+        extractor = make_extractor(r"(cat).*?(dog)")
+        result, processed_string = extractor(string)
+        self.assertEqual(processed_string, "I have a .")
+        self.assertEqual(result, [(("cat", "dog"), ((9, 28),))])
+
+        extractor2 = make_extractor(r"(cat).*?(dog)", remove_only_matches=True)
+        result, processed_string = extractor2(string)
+        self.assertEqual(processed_string, "I have a  and another .")
+        self.assertEqual(result, [(("cat", "dog"), ((9, 12), (25, 28)))])
+
+        # TODO: maybe match output and spans together?
diff --git a/django_files/observatory/views.py b/django_files/observatory/views.py
index 9da91d6a..842ad654 100644
--- a/django_files/observatory/views.py
+++ b/django_files/observatory/views.py
@@ -18,7 +18,6 @@ import json
 from django.core import serializers
 from django.core.urlresolvers import reverse
-from elasticsearch import Elasticsearch
 # Project specific
 from django.utils.translation import gettext as _
 # App specific
@@ -2546,63 +2545,3 @@ def get_country_lookup():
         lookup[c.id] = [c.name_en, c.name_3char]
     return lookup
 
-def api_search(request):
-
-    query = request.GET.get("term", None)
-    if query == None:
-        return HttpResponse("[]")
-
-    span, years = helpers.extract_years(query)
-    if span is not None:
-        # Strip out year expression from query since elasticsearch doesn't
-        # contain year data
-        query = query[:span[0]] + query[span[1]:]
-
-    if years is None:
-        year_string = ""
-        year_url_param = ""
-    elif len(years) == 1:
-        year_string = " (%s)" % years[0]
-        year_url_param = "%s/" % years[0]
-    else:
-        year_string = " (%s to %s)" % (years[0], years[1])
-        year_url_param = "%s.%s/" % (years[0], years[1])
-
-    es = Elasticsearch()
-    result = es.search(
-        index="questions",
-        body={
-            "query": {
-                "filtered": {
-                    "query": {
-                        "fuzzy_like_this": {
-                            "like_text": query,
-                            "fields": ["title"],
-                            "fuzziness": 3,
-                            "max_query_terms": 15,
-                            "prefix_length": 4
-                        }
-                    }
-                }
-            },
-            # "highlight": {
-            #     "pre_tags": ["<strong>"],
-            #     "fields": {"title": {}},
-            #     "post_tags": ["</strong>"]
-            # },
-            "size": 8
-        })
-    result_list = []
-    for x in result['hits']['hits']:
-        label = x['_source']['title'] + year_string
-        url = x['_source']['url'] + year_url_param
-        # TODO: This is a hack, the correct way is to generate the url here
-        # instead of pregenerating it. See issue #134
-        if years and len(years) > 1:
-            url = url.replace("tree_map", "stacked")
-        result_list.append(dict(label=label, value=url))
-    return HttpResponse(json.dumps(result_list))
-
-
-def search(request):
-    return render_to_response("test_search.html")
diff --git a/django_files/observatory/views_search.py b/django_files/observatory/views_search.py
new file mode 100644
index 00000000..3de3f4bd
--- /dev/null
+++ b/django_files/observatory/views_search.py
@@ -0,0 +1,295 @@
+from django.conf import settings
+from django.http import HttpResponse
+from elasticsearch import Elasticsearch
+
+from collections import defaultdict, OrderedDict
+import json
+import re
+
+# These are different from the regions in the DB in that they are a bit more
+# generalized.
+REGIONS = [
+    "europe",
+    "asia",
+    "america",
+    "africa",
+    "caribbean",
+    "micronesia",
+    "melanesia",
+    "polynesia",
+    "australia"
+]
+
+# These are different from the product communities in the DB in that the
+# names are simplified.
+# TODO: maybe put this in the DB as a property? The actual community names
+# are too long to be handy to remember.
+PRODUCT_COMMUNITY = [
+    "Animal Products",
+    "Vegetable products",
+    "Foodstuffs",
+    "Mineral Products",
+    "Chemicals",
+    "Plastics",
+    "Leather",
+    "Wood Products",
+    "Textiles",
+    "Footwear",
+    "Stone",
+    "Metals",
+    "Machinery",
+    "Transportation",
+    "Service"
+]
+PRODUCT_COMMUNITY_RE = re.compile("|".join(PRODUCT_COMMUNITY),
+                                  re.IGNORECASE)
+
+REGIONS_RE = re.compile("|".join(REGIONS), re.IGNORECASE)
+
+API_NAMES = ["casy", "cspy", "csay", "ccsy", "sapy"]
+API_NAMES_RE = re.compile("|".join(API_NAMES), re.IGNORECASE)
+
+TRADE_FLOWS = ["import", "export", "net_import", "net_export"]
+TRADE_FLOWS_RE = re.compile("|".join(TRADE_FLOWS), re.IGNORECASE)
+
+APP_NAMES = ["map", "pie_scatter", "stacked", "product_space", "rings",
+             "tree_map"]
+APP_NAMES_RE = re.compile("|".join(APP_NAMES))
+
+PRODUCT_CODE_RE = re.compile(r"(\d{4})")
+
+YEAR_EXPRESSIONS = [
+    re.compile(r'between (\d{4}) and (\d{4})', re.IGNORECASE),
+    re.compile(r'from (\d{4}) to (\d{4})', re.IGNORECASE),
+    re.compile(r'(\d{4}).*(\d{4})'),
+    re.compile(r'(?:in|at|during) (\d{4})', re.IGNORECASE),
+    re.compile(r'(\d{4})')
+]
+
+
+def extract_years(input_str):
+    """Extract things that look like years out of a given plaintext."""
+    results = (exp.search(input_str) for exp in YEAR_EXPRESSIONS)
+    results = [result for result in results if result is not None]
+
+    if len(results) == 0:
+        return None, None
+    else:
+        years = results[0].groups()
+        for year in years:
+            if not (1995 <= int(year) <= 2013):
+                return None, None
+        return results[0].span(), years
+
+
+def generate_year_strings(years):
+    """Handle generating URL parts like '2010.2012' or search result
+    additions like (2012 to 2014)."""
+    if years is None:
+        year_string = ""
+        year_url_param = ""
+    elif len(years) == 1:
+        year_string = " (%s)" % years[0]
+        year_url_param = "%s/" % years[0]
+    else:
+        year_string = " (%s to %s)" % (years[0], years[1])
+        year_url_param = "%s.%s/" % (years[0], years[1])
+    return year_string, year_url_param
+
+
+def remove_spans(string, spans):
+    """Given a list of (start, end) index pairs, remove all those from a
+    string. This is tricky because if you remove them one by one the indices
+    are off."""
+
+    if len(spans) == 0:
+        return string
+
+    result = []
+
+    span_iter = iter(spans)
+    current_span = next(span_iter)
+
+    for idx, c in enumerate(string):
+
+        if idx < current_span[0]:
+            result.append(c)
+        elif idx >= current_span[1]:
+            current_span = next(span_iter, None)
+            if current_span is not None:
+                if not (current_span[0] <= idx < current_span[1]):
+                    result.append(c)
+            else:
+                result.append(string[idx:])
+                break
+
+    return "".join(result)
+
+
+def make_extractor(compiled_regex, remove_extracted=True,
+                   remove_only_matches=False):
+    """Given a regex, gives you back a function that'll use that regex to
+    extract data from a string. Specifically it:
+
+    1. Finds all strings matched by the regex
+    2. Returns those strings and their start and end positions as a list of
+       tuples:
+       [(("cat", "fish"), ((2, 5), (8, 12))),
+        (("dog", "bone"), ((14, 27),))]
+    3. Optionally removes those strings from the original string.
+    4. Returns the original string, possibly unchanged depending on (3)
+
+    :param remove_extracted: Whether to remove the extracted string from the
+    original or not.
+    :param remove_only_matches: If the regex has capturing parentheses
+    (a.k.a. groups), setting this to True removes only the parenthesized
+    parts. So given r"(\d{4}) to (\d{4})", True removes just the numbers,
+    whereas the default removes both numbers AND the ' to '. It will also
+    return a separate span for each removed group."""
+    def extractor(query):
+
+        matches = re.finditer(compiled_regex, query)
+        results = []
+        spans = ()
+
+        for match in matches:
+
+            if remove_only_matches:
+                # Remove match groups individually
+                group_indices = range(1, len(match.groups()) + 1)
+                match_spans = tuple(match.span(i) for i in group_indices)
+            else:
+                # Remove whole match at once
+                match_spans = (match.span(),)
+
+            if remove_extracted:
+                spans += match_spans
+
+            results.append((match.groups() or (match.group(),), match_spans))
+
+        query = remove_spans(query, spans)
+        return results, query
+
+    return extractor
+
+
+# Extractors to run on query string, in order.
+# elasticsearch field -> extractor function
+EXTRACTORS = OrderedDict([
+    ("regions", make_extractor(REGIONS_RE)),
+    ("api_name", make_extractor(API_NAMES_RE)),
+    ("app_name", make_extractor(APP_NAMES_RE)),
+    ("trade_flow", make_extractor(TRADE_FLOWS_RE)),
+    ("product_code", make_extractor(PRODUCT_CODE_RE)),
+    ("product_community", make_extractor(PRODUCT_COMMUNITY_RE)),
+])
+
+
+def parse_search(query):
+    """Given a search query string, figure out what kind of search it is."""
+
+    kwargs = {}
+    query_type = None
+
+    # Extract years like in "germany france 2012 2014"
+    span, years = extract_years(query)
+    if years is not None:
+        # Strip out year expression from query since elasticsearch doesn't
+        # contain year data
+        query = query[:span[0]] + query[span[1]:]
+        kwargs["years"] = years
+        kwargs["year_string"], kwargs["year_url_param"] = \
+            generate_year_strings(years)
+
+    # It matters that years get extracted before product codes since it's
+    # much likelier that '2012' is a year than a product code. Years are
+    # checked to be within valid bounds; anything that's not a valid year
+    # doesn't get stripped from the query and thus can potentially be found
+    # as a product code.
+
+    # Extract the remaining common fields like region, product codes etc.
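+    # Each extractor strips the text it recognizes, so whatever is left in
+    # `query` after this loop is what gets fuzzy-matched against question
+    # titles in api_search below.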
+    for extractor_name, extractor in EXTRACTORS.iteritems():
+        result, query = extractor(query)
+        if len(result):
+            kwargs[extractor_name] = [x[0][0] for x in result]
+
+    # Determine query type
+    if len(query) == 4 and query in API_NAMES:
+        query_type = "api"
+
+    return query, query_type, kwargs
+
+
+def prepare_filters(kwargs):
+
+    filters = defaultdict(list)
+
+    for key in EXTRACTORS.keys():
+        if key in kwargs:
+            filters[key] += kwargs[key]
+
+    return filters
+
+
+def api_search(request):
+
+    query = request.GET.get("term", None)
+    if query is None:
+        return HttpResponse("[]")
+
+    query, query_type, kwargs = parse_search(query)
+    filters = prepare_filters(kwargs)
+
+    es_query = {
+        "query": {
+            "filtered": {}
+        },
+        "size": 8
+    }
+
+    # Add filters to the query if they were given. Filters are ANDed.
+    if filters:
+        es_filters = [{"terms": {k: v}} for k, v in filters.iteritems()]
+        es_filters = {"bool": {"must": es_filters}}
+        es_query["query"]["filtered"]["filter"] = es_filters
+
+    # Add fuzzy search for query string if any non-filter query string
+    # remains after taking out the filters
+    if query.strip() != "":
+        es_query["query"]["filtered"]["query"] = {
+            "fuzzy_like_this": {
+                "like_text": query,
+                "fields": ["title"],
+                "fuzziness": 3,
+                "max_query_terms": 15,
+                "prefix_length": 4
+            }
+        }
+
+    # Do the query
+    es = Elasticsearch()
+    result = es.search(index="questions", body=es_query)
+
+    # Format the results in a way that complies with the OpenSearch
+    # standard's suggestion extension
+    labels = []
+    urls = []
+    for x in result['hits']['hits']:
+        label = x['_source']['title'] + kwargs.get('year_string', '')
+        url = x['_source']['url'] + kwargs.get('year_url_param', '')
+        # TODO: This is a hack, the correct way is to generate the url here
+        # instead of pregenerating it. See issue #134
+        if len(kwargs.get('years', '')) > 1:
+            url = url.replace("tree_map", "stacked")
+        labels.append(label)
+        urls.append(settings.HTTP_HOST + url)
+
+    return HttpResponse(json.dumps([
+        query,
+        labels,
+        [],
+        urls
+    ]))
diff --git a/html/explore/index.html b/html/explore/index.html
index 9cbb1645..1d330452 100644
--- a/html/explore/index.html
+++ b/html/explore/index.html
@@ -22,6 +22,7 @@
   {% endif %}
+  <link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="The Atlas of Economic Complexity">
   <title>The Atlas Of Economic Complexity{% if title %} | {{title}}{% else %}{% endif %}</title>
diff --git a/html/opensearch.xml b/html/opensearch.xml
new file mode 100644
index 00000000..906a3d9f
--- /dev/null
+++ b/html/opensearch.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/">
+  <ShortName>Economic Complexity</ShortName>
+  <Description>Search the Atlas of Economic Complexity</Description>
+  <InputEncoding>UTF-8</InputEncoding>
+  <Language>en</Language>
+  <Url type="application/x-suggestions+json"
+       template="{{HTTP_HOST}}api/search/?term={searchTerms}"/>
+  <Url type="text/html"
+       template="{{HTTP_HOST}}search/?term={searchTerms}"/>
+</OpenSearchDescription>
diff --git a/html/searchresults.html b/html/searchresults.html
new file mode 100644
index 00000000..8ef9b46a
--- /dev/null
+++ b/html/searchresults.html
@@ -0,0 +1,19 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <title>Atlas of Economic Complexity | Search</title>
+  <link rel="stylesheet" href="//code.jquery.com/ui/1.10.4/themes/smoothness/jquery-ui.css">
+  <link rel="stylesheet" href="/media/css/searchresults.css">
+  <script src="//code.jquery.com/jquery-1.10.2.js"></script>
+  <script src="//code.jquery.com/ui/1.10.4/jquery-ui.js"></script>
+  <script src="/media/js/searchresults.js"></script>
+</head>
+<body>
+
+  <h1>Search the Atlas of Economic Complexity</h1>
+
+  <div id="search">
+    <input id="searchbar" type="text">
+  </div>
+</body>
+</html>
diff --git a/html/template.html b/html/template.html
index ef78f726..90cf3c45 100644
--- a/html/template.html
+++ b/html/template.html
@@ -8,6 +8,7 @@
   <title>The Atlas of Economic Complexity | {% block page_title %}{% endblock %}</title>
+  <link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="The Atlas of Economic Complexity">
 
diff --git a/html/test_search.html b/html/test_search.html
deleted file mode 100644
index 08b61455..00000000
--- a/html/test_search.html
+++ /dev/null
@@ -1,62 +0,0 @@
-<!doctype html>
-<html lang="en">
-<head>
-  <meta charset="utf-8">
-  <title>jQuery UI Autocomplete - Remote with caching</title>
diff --git a/media/css/searchresults.css b/media/css/searchresults.css
new file mode 100644
index 00000000..fc1480b5
--- /dev/null
+++ b/media/css/searchresults.css
@@ -0,0 +1,24 @@
+h1 {
+    font-family: "Gill Sans", "Gill Sans MT", "Helvetica Neue", Calibri, Arial, sans-serif;
+    font-weight: 300;
+    text-align: center;
+    margin-top: 110px;
+}
+
+#search {
+    text-align: center;
+    margin-top: 40px;
+}
+#search input {
+    font-size: 14px;
+    width: 400px;
+}
+
+.ui-menu {
+    width: 390px;
+}
+.ui-menu .ui-menu-item {
+    padding-bottom: 5px;
+    border-bottom: 1px solid #ccc;
+}
+
diff --git a/media/js/searchresults.js b/media/js/searchresults.js
new file mode 100644
index 00000000..511a0b6b
--- /dev/null
+++ b/media/js/searchresults.js
@@ -0,0 +1,61 @@
+// Function to read query parameters
+function getQueryParameterByName(name) {
+    var match = RegExp('[?&]' + name + '=([^&]*)').exec(window.location.search);
+    return match && decodeURIComponent(match[1].replace(/\+/g, ' '));
+}
+
+// jQuery hack to highlight search terms
+$(function() {
+
+$.ui.autocomplete.prototype._renderItem = function( ul, item ) {
+    var term = this.term.split(' ').join('|');
+    var re = new RegExp("(" + term + ")", "gi");
+    var t = item.label.replace(re, "<strong>$1</strong>");
+    return $( "<li>  • " )
+        .data( "item.autocomplete", item )
+        .append( "<a>" + t + "</a>" )
+        .appendTo( ul );
+};
+
+// Set up autocomplete callback
+var cache = {};
+$("#searchbar").autocomplete({
+    minLength: 3,
+    delay: 260,
+    source: function(request, response) {
+        var term = request.term;
+        if (term in cache) {
+            response(cache[term]);
+            return;
+        }
+        $.getJSON("../api/search/", request, function(data, status, xhr) {
+            var reshaped_data = [];
+            for (var i = 0; i < data[1].length; i++) {
+                reshaped_data.push({label: data[1][i], value: data[3][i]});
+            }
+            cache[term] = reshaped_data;
+            response(reshaped_data);
+        });
+    },
+    select: function(event, ui) {
+        // Go to selected URL
+        event.preventDefault();
+        $(this).val(ui.item.label);
+        window.location.href = ui.item.value;
+    },
+    focus: function(event, ui) {
+        // Get rid of the behavior where the keyboard up/down arrows replace
+        // the textbox contents with the url instead of the search result.
+        event.preventDefault();
+    }
+});
+
+var querystring = getQueryParameterByName("term");
+var bar = $("#searchbar");
+bar.val(querystring);
+bar.autocomplete("search", querystring);
+bar.focus();
+
+});