Skip to content
This repository has been archived by the owner on Feb 28, 2024. It is now read-only.

Commit

Permalink
Basic script for indexing all raw documents
Browse files Browse the repository at this point in the history
  • Loading branch information
anupdhml committed May 14, 2019
1 parent 3356ce5 commit 6d83229
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 4 deletions.
26 changes: 26 additions & 0 deletions elasticsearch/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
Source documents for our elasticsearch instance. Currently includes results from crawling various websites.

A document here is a json file that follows the schema laid out in our elasticsearch [templates](../scripts/set_nepali_template.sh).

TODO keep the document files in a separate repo -- they are here now for convenience.

## Indexing

This assumes that the akshara elasticsearch instance is already up and running -- for more details on that, please refer to the main [README](../README.md).

The commands here are meant to be run from the same host as our elasticsearch instance.

```
# prep the cluster, if not done already (this is safe to run anytime)
../setup_akshara_cluster.sh
# drop the current index, if needed (eg: on mapping changes or new field additions)
# all docs are written to akshara_nepali index right now
curl -XDELETE "${HOSTNAME}:9200/akshara_nepali"
# index all the documents in a given folder
./indexer.sh crawled
# basic stats on the indices
curl "${HOSTNAME}:9200/_cat/indices?v"
```
35 changes: 35 additions & 0 deletions elasticsearch/data/indexer.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash
#
# indexer.sh
#
# Index all the raw akshara documents: every *.json file found (recursively)
# under RAW_DATA_DIR is handed, one file per invocation, to the indexing
# script.
#
# TODO use bulk api and do this all from python, so that we can do other
# things like indexing the docs along with autocomplete suggestions.
#
# Usage: ./indexer.sh [RAW_DATA_DIR]
#
#   RAW_DATA_DIR  directory to scan for json documents, resolved against the
#                 current working directory (default: "crawled")
#
# Exit status: non-zero when RAW_DATA_DIR or the indexing script is missing,
# or when the find | xargs pipeline reports a failure.

# exit the script when a command fails
set -o errexit

# catch exit status for piped commands
set -o pipefail

DEFAULT_RAW_DATA_DIR="crawled"

# if no arg is provided to the script, use the default
RAW_DATA_DIR="${1:-${DEFAULT_RAW_DATA_DIR}}"

if [[ ! -d "$RAW_DATA_DIR" ]]; then
  # diagnostics go to stderr so they do not mix with the regular output
  echo "${RAW_DATA_DIR}: No such directory" >&2
  exit 1
fi

# Locate the indexing script relative to this file instead of the caller's
# working directory, so the script also works when started from elsewhere.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

#INDEXING_SCRIPT="echo"
INDEXING_SCRIPT="${SCRIPT_DIR}/../test/index_akshara.sh"

# Fail early with a clear message rather than letting xargs report the problem.
# 'command -v' accepts both a path to an executable and a plain command name
# (such as the "echo" dry-run setting above).
if ! command -v "$INDEXING_SCRIPT" > /dev/null; then
  echo "${INDEXING_SCRIPT}: indexing script not found or not executable" >&2
  exit 1
fi

###############################################################################

echo "Indexing json files in directory: ${RAW_DATA_DIR}"

# NUL-delimited hand-off keeps file names with spaces or newlines intact.
# -I{} runs the indexing script once per file.
find "$RAW_DATA_DIR" -type f -name '*.json' -print0 \
  | xargs -0 -I{} "$INDEXING_SCRIPT" '{}'
1 change: 1 addition & 0 deletions elasticsearch/data/samples/kavitakosh/sample_doc.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"author": "\u0905\u0917\u092e\u0926\u093f\u0932\u0926\u093e\u0938", "genre": "\u0915\u0935\u093f\u0924\u093e", "source_link": "http://kavitakosh.org/kk/%E0%A4%85%E0%A4%97%E0%A4%AE%E0%A4%A6%E0%A4%BF%E0%A4%B2%E0%A4%A6%E0%A4%BE%E0%A4%B8", "source": "\u0928\u0947\u092a\u093e\u0932\u0940 - \u0915\u0935\u093f\u0924\u093e \u0915\u094b\u0936", "title": "\u0930\u093e\u0917 \u0928\u093f\u0930\u094d\u0917\u0941\u0923 / \u0905\u0917\u092e\u0926\u093f\u0932\u0926\u093e\u0938", "text": "\u0930\u093e\u092e \u0939\u0930\u093f \u092d\u091c\u0940 \u0909\u0924\u093e\u0930\u0947 \u0928\u0926\u093f\u092f\u093e \u092a\u093e\u0930\u0915\u0947 \u0918\u0930 \u092e\u094b\u0939\u0928 \u092e\u093e\u092f\u093e \u0964\u0964\u091f\u0947\u0915\u0964\u0964\n\u092e\u094b\u0939\u0915\u094b \u091c\u0928\u094d\u091c\u0940\u0930 \u0924\u094b\u0921\u0940 \u0924\u0939\u093e\u0901 \u092e\u093e\u092f\u093e \u092d\u0941\u0932 \u0938\u094c \u092d\u0941\u0932\u0947 \u0906\u091c\n\u092e\u0924 \u092d\u0942\u0932\u094b \u0905\u092e\u0930 \u0939\u093e\u091f \u0928 \u0917\u0941\u092e\u093e\u090a, \u092e\u094b\u0939\u0928 \u092e\u093e\u092f\u093e \u0964\u0964\u0967\u0964\u0964\n\u092e\u094b\u0939\u0915\u0947 \u092c\u093e\u0926\u0932\u0941 \u092d\u0930\u0947\u090a \u092f\u0939\u093e\u0901 \u0924\u0939\u093e\u0901 \u0916\u0947\u0932 \u0938\u094b \u0916\u0947\u0932\n\u092c\u0928\u0940 \u0938\u092c \u0906\u0908\n\u091a\u0932\u093f\u090f \u0938\u0916\u0947\u0930 \u0905\u092e\u094d\u092e\u0930 \u0927\u093e\u092e\u092e\u093e \u091c\u093e\u090a\u0901, \u092e\u094b\u0939\u0928 \u092e\u093e\u092f\u093e \u0964\u0964\u0968\u0964\u0964\n\u0938\u093e\u0901\u091a\u093e\u0915\u094b \u0938\u093e\u0901\u091a\u0940 \u0938\u093e\u0947\u0939\u0940 \u092e\u0928 \u092c\u093e\u0901\u0927\u0940 \u091a\u0932\u093f\u092f \u0936\u093e\u0928\u094d\u0924 \u0935\u0939\u093f \u0926\u0947\u0936\n\u091c\u0939\u093e\u0901 \u0906\u0916\u0947\u091f \u091a\u093e\u0939\u0947 \u0928\u0940\u091c \u091a\u093e\u092e\u094d\u092e\u093e 
\u091c\u093e\u0909\u0901, \u092e\u094b\u0939\u0928 \u092e\u093e\u092f\u093e \u0964\u0964\u0969\u0964\u0964\n\u0938\u0941\u0930\u0924 \u0915\u0940 \u091a\u093e\u0939\u093e\u0930\u0940 \u0932\u094d\u092f\u094c \u0939\u093e\u0924 \u0928\u093e\u092e\u0936\u093f \u0938\u0941\u0928,\n\u0924\u0941\u0908 \u0928\u0924\u093e\u0928 \u0921\u0917\u092e\u0917 \u0928\u0939\u093f \u0938\u0941\u0930\u0924 \u0938\u093e\u0927\u0940\n\u092a\u0930\u092e\u094d \u0927\u093e\u092e\u094d\u092e\u093e \u091c\u093e\u0928\u0941 \u092f\u0939\u093f \u0930\u0940\u0924 \u092a\u093e\u0930\u0940, \u092e\u094b\u0939\u0928 \u092e\u093e\u092f\u093e \u0964\u0964\u096a\u0964\u0964\n\u0915\u0939\u0947 \u0926\u093e\u0938 \u0905\u0917\u092e\u0926\u093f\u0932 \u0938\u0941\u0928\u094b \u092d\u093e\u0908 \u0938\u093e\u0927\u0941\n\u091a\u093e\u0939\u0947 \u0935\u093f\u091a\u093e\u0939\u0947 \u0928\u093e\u0925\u0932\u0940 \u0913\u0918\u094b \u0909\u0920\u0940 \u0939\u0947\u0930\u0941\u0902 \u0926\u0947\u0936\n\u092a\u0930\u092e \u0927\u093e\u092e \u091c\u093e\u0928 \u092a\u0930\u0940 \u092a\u0941\u0922\u094b \u092e\u094b\u0939\u0928 \u092e\u093e\u092f\u093e", "lang": "\u0928\u0947\u092a\u093e\u0932\u0940"}
1 change: 1 addition & 0 deletions elasticsearch/data/samples/sahityasangraha/sample_doc.json

Large diffs are not rendered by default.

12 changes: 8 additions & 4 deletions elasticsearch/test/index_akshara.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ fi

ELASTICSEARCH_URL="http://${HOSTNAME}:9200"

INDEX="akshara_nepali_test"
#INDEX="akshara_nepali_test" # for testing
INDEX="akshara_nepali" # prod index name

PIPELINE="akshara_pipeline"

# as unix time
Expand All @@ -37,16 +39,18 @@ PIPELINE="akshara_pipeline"
##############################################################################

#echo "Deleting index ${INDEX} (if it exists)..."
#curl --silent -XDELETE "${ELASTICSEARCH_URL}/${INDEX}"
#curl --silent --show-error --request DELETE "${ELASTICSEARCH_URL}/${INDEX}"
#echo ""

for file in "$@"; do
echo "Indexing ${file}..."
echo ""
echo "Indexing '${file}' to index '${INDEX}'"

content=$(<"$file")
#echo "$content"

curl -XPOST "${ELASTICSEARCH_URL}/${INDEX}/_doc/?pipeline=${PIPELINE}&pretty" \
curl --silent --show-error --request POST \
"${ELASTICSEARCH_URL}/${INDEX}/_doc/?pipeline=${PIPELINE}&pretty" \
--header "Content-Type: application/json" \
--data "$content"

Expand Down

0 comments on commit 6d83229

Please sign in to comment.