[WIP] Add tests everywhere #59

Open · wants to merge 3 commits into base: master
1 change: 0 additions & 1 deletion .gitignore
@@ -1,6 +1,5 @@
_book
bundle
*.asdf

#Mac OS
*.DS_Store
3 changes: 2 additions & 1 deletion .travis.yml
@@ -13,7 +13,7 @@ _install: &_install
- gimme 1.8
- source ~/.gimme/envs/latest.env
- pip install --upgrade pip
- pip install codecov
- pip install -r requirements.txt codecov
- pip install -e .
_coverage: &_coverage
- SCRIPT="coverage run --concurrency=multiprocessing -m unittest discover && coverage combine"
@@ -38,6 +38,7 @@ before_script:
- docker run -d --privileged -p 9432:9432 --name bblfshd bblfsh/bblfshd
- docker exec -it bblfshd bblfshctl driver install python bblfsh/python-driver
- docker run --name scylla -p 9042:9042 -d scylladb/scylla --developer-mode=1
- sleep 90
script:
- (eval "$SCRIPT")
notifications:
18 changes: 15 additions & 3 deletions README.md
@@ -55,6 +55,8 @@ Apollo is structured as a series of commands in CLI. It stores data in [Cassandr
writes MinHashCuda batches on disk. Community detection is delegated to [igraph](http://igraph.org/python/).

* `resetdb` (erases) and initializes a Cassandra keyspace.
* `preprocess` creates indexes for the files and features, and stores them on disk.
Runs source{d} engine through PySpark.
* `bags` extracts the features, stores them in the database and writes MinHashCuda batches on disk.
Runs source{d} engine through PySpark.
* `hash` performs the hashing, writes the hashtables to the database and hashing parameters on disk
@@ -94,13 +96,23 @@ in your browser. There are multiple Docker options available, e.g.

## Docker command snippets

### Preprocess

```
docker run -it --rm -v /path/to/io:/io --link bblfshd srcd/apollo preprocess -r /io/siva \
--cached-index-path /io/bags/index.asdf --docfreq-out /io/bags/docfreq.asdf -f id lit uast2seq --uast2seq-seq-len 4 \
-l Java Python -s 'local[*]' --min-docfreq 5 --bblfsh bblfshd --persist MEMORY_ONLY \
--config spark.executor.memory=4G spark.driver.memory=10G spark.driver.maxResultSize=4G
```

### Bags

```
docker run -it --rm -v /path/to/io:/io --link bblfshd --link scylla srcd/apollo bags -r /io/siva \
--bow /io/bags/bow.asdf --docfreq /io/bags/docfreq.asdf -f id lit uast2seq --uast2seq-seq-len 4 \
-l Java Python -s 'local[*]' --min-docfreq 5 --bblfsh bblfshd --cassandra scylla --persist MEMORY_ONLY \
--config spark.executor.memory=4G spark.driver.memory=10G spark.driver.maxResultSize=4G
--bow /io/bags/bow.asdf --cached-index-path /io/bags/index.asdf --docfreq-in /io/bags/docfreq.asdf \
-f id lit uast2seq --uast2seq-seq-len 4 -l Java Python -s 'local[*]' --min-docfreq 5 --bblfsh bblfshd \
--cassandra scylla --persist MEMORY_ONLY --config spark.executor.memory=4G \
spark.driver.memory=10G spark.driver.maxResultSize=4G
```

### Hash
3 changes: 1 addition & 2 deletions apollo/__main__.py
@@ -8,9 +8,8 @@
from modelforge.logs import setup_logging
from sourced.ml import extractors
from sourced.ml.utils import add_engine_args, add_spark_args
from sourced.ml.cmd import ArgumentDefaultsHelpFormatterNoNone
from sourced.ml.cmd.args import add_bow_args, add_feature_args, add_repo2_args, \
add_df_args, add_repartitioner_arg
add_df_args, add_repartitioner_arg, ArgumentDefaultsHelpFormatterNoNone

from apollo.bags import preprocess, source2bags
from apollo.cassandra_utils import reset_db
4 changes: 2 additions & 2 deletions apollo/bags.py
@@ -1,5 +1,5 @@
from pyspark.sql.types import Row
from sourced.ml.cmd import repos2bow_template, repos2bow_index_template
from sourced.ml.cmd import repos2bow_template, repos2bow_index
from sourced.ml.transformers import Transformer

from apollo import cassandra_utils
@@ -46,7 +46,7 @@ def __call__(self, head):


def preprocess(args):
return repos2bow_index_template(args)
return repos2bow_index(args)


def source2bags(args):
4 changes: 2 additions & 2 deletions apollo/cassandra_utils.py
@@ -127,7 +127,7 @@ def _pump(self):
rows = self.session.execute(query)
buffer = self.buffer
buffer.extend(None for _ in items)
l = len(items) # noqa
num_items = len(items)
count = 0
for r in rows:
count += 1
@@ -138,7 +138,7 @@ def _pump(self):
m = None
# reverse order - we will pop() in __next__
tr = r.sha1, (r.repo, r.commit, r.path)
buffer[l - i - 1] = (tr + (m,)) if meta else tr
buffer[num_items - i - 1] = (tr + (m,)) if meta else tr
self._log.debug("-> %d", count)


3 changes: 3 additions & 0 deletions apollo/graph.py
@@ -185,6 +185,9 @@ def _generate_tree(self):
indptr[i + 1] = pos
return {"data": data, "indptr": indptr, "elements": merge_strings(self.id_to_element)}

def dump(self):
return "Number of communities: %s" % (len(self.communities))

def count_elements(self):
return sum(sum(1 for i in c if i < len(self.id_to_element)) for c in self.communities)

31 changes: 23 additions & 8 deletions doc/101.md
@@ -10,19 +10,34 @@ Cassandra or ScyllaDB must be running.
Apollo works with Git repositories stored in [Siva](https://github.com/src-d/go-siva) format.
Refer to [Borges](https://github.com/src-d/borges). We expect that the files will be in `/data` below.

### Index the files

We create the `OrderedDocumentFrequency` model, which stores the index and the global value
frequencies of the features, `QuantizationLevels` models if need be, and a `DocumentFrequency`
model which holds the index of all files.

```
apollo preprocess -r /data --cached-index-path index.asdf --docfreq docfreq.asdf \
-f lit id uast2seq --uast2seq-seq-len 4 --uast2seq-weight 2 --min-docfreq 4 \
-l Java Python --persist DISK_ONLY
```

> Docker users should add `--bblfsh bblfshd`.

More about [`preprocess`](cmd/preprocess.md).

### Extract the features

We convert every file into a [weighted set of features](https://en.wikipedia.org/wiki/Bag-of-words_model).
The batches for the `hash` command are written to `./bow*.asdf` (by default splitted by 2 GB) and
the calculated global feature value frequencies are written to`./docfreq.asdf`. We use three
extractors: literals, identifiers and deterministic AST subpaths of size 4. We double the importance
of the latter features and throw away any values which appear in less than 4 files. Only Java source
code is analysed. We optimize the pipeline executing by using the disk cache to save
the [UASTs](https://doc.bblf.sh/uast/code-to-ast.html) between each pass. The extracted bags
are additionally saved in the database.
The batches for the `hash` command are written to `./bow*.asdf` (split into 2 GB files by default).
We use three extractors: literals, identifiers and deterministic AST subpaths of size 4.
We double the importance of the latter features and throw away any values which appear
in fewer than 4 files. Only Java and Python source code is analysed. We optimize the pipeline
execution by using the disk cache to save the [UASTs](https://doc.bblf.sh/uast/code-to-ast.html)
between each pass. The extracted bags are additionally saved in the database.

```
apollo bags -r /data --bow bow.asdf --docfreq docfreq.asdf \
apollo bags -r /data --bow bow.asdf --docfreq docfreq.asdf --cached-index-path index.asdf \
-f lit id uast2seq --uast2seq-seq-len 4 --uast2seq-weight 2 --min-docfreq 4 \
-l Java Python --persist DISK_ONLY
```
4 changes: 4 additions & 0 deletions doc/SUMMARY.md
@@ -18,3 +18,7 @@
* [cmd](cmd/cmd.md)
* [dumpcmd](cmd/dumpcmd.md)
* [evalcc](cmd/evalcc.md)
* Models reference
* [Weighted MinHash parameters](model/wmh.md)
* [Connected components](model/cc.md)
* [Communities](model/cmd.md)
4 changes: 3 additions & 1 deletion doc/cmd/bags.md
@@ -1,6 +1,7 @@
# Bags command

This command converts input repositories to unordered weighted bags of features that are stored in DB, writes MinHashCuda batches, and writes the Ordered Documents Frequency model as well as the optional Quantization Levels model. You can specify the following arguments:
This command converts input repositories to unordered weighted bags of features that are stored in DB,
and written as `BOW` models to be used as MinHashCuda batches. You can specify the following arguments:

- `-r`/`--repositories` : Path to the input files
- `--parquet`: If your input files are Parquet files
@@ -14,6 +15,7 @@ This command converts input repositories to unordered weighted bags of features
- `--docfreq-out`: Path to the output Ordered Document Frequency model (can not be used with `docfreq-in`)
- `-v`/`--vocabulary-size`: to specify the maximum vocabulary size, defaults to 10 million
- `--cached-index-path`: Path to a precomputed Document Frequency model storing an index of the documents to be extracted
- `--num-iterations`: to select the number of iterations over which the data is processed (this can prevent failures when the amount of data is large), defaults to 1
- `--partitions`: to repartition data, this will specify new number of partitions
- `--shuffle`: to repartition data, this will allow data shuffling (vital if number of partitions increases !)
- [Feature arguments](features.md)
4 changes: 3 additions & 1 deletion doc/cmd/cc.md
@@ -1,6 +1,8 @@
# CC command

This command runs the connected components analysis on previously created hash tables, you can specify the following arguments:
This command runs the connected components analysis on previously created hash tables
and saves the connected components in [this `Model`](/doc/model/cc.md). You can specify the following arguments:

- `-o`/`--output`: Path to the output Connected Components model
- [Cassandra/Scylla arguments](db.md)

4 changes: 3 additions & 1 deletion doc/cmd/cmd.md
@@ -2,7 +2,9 @@

__Currently does not work in Spark Cluster mode.__

This command runs the community detection on a Connected Components model, you can specify the following arguments:
This command runs the community detection on a previously created Connected Components
model and saves the communities in [this `Model`](/doc/model/cmd.md). You can specify
the following arguments:

- `-i`/`--input`: Path to the input Connected Components model
- `-o`/`--output`: Path to the output Community Detection model
4 changes: 3 additions & 1 deletion doc/cmd/hash.md
@@ -2,7 +2,9 @@

__Currently does not work in Spark Cluster mode.__

This command applies the MinHashCUDA algorithm on previously written batches, stores hashes and hash tables in DB and saves the Weighted MinHash (WMH) parameters.
This command applies the MinHashCUDA algorithm on previously written batches,
stores hashes and hash tables in DB and saves the Weighted MinHash (WMH) parameters
in [this `Model`](/doc/model/wmh.md). You can specify the following arguments:

- `-i`/`--input`: Path to the input batch(es)
- `--seed`: Specific random generator (useful for cross execution comparisons), defaults to a random number depending on the time
31 changes: 31 additions & 0 deletions doc/model/cc.md
@@ -0,0 +1,31 @@
# Connected Components Model

This model stores the connected components found in the pairwise similarity
graph after hashing by the `cc` command.

**A quick reminder**

A document hashes to as many buckets as there are hashtables, which means if there are
3 hashtables, then a document hashes to 3 buckets. The number of hashtables increases
as the similarity threshold decreases. Any two documents that hash to at least one bucket
in common are in the same component.
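
As a quick toy illustration (plain Python, not Apollo code), here is a minimal union-find
sketch of this rule — documents that share at least one bucket end up in the same component:

```
# document id -> the buckets it hashes to (here, 3 hashtables -> 3 buckets each)
buckets = {
    0: {10, 11, 12},
    1: {12, 20, 21},   # shares bucket 12 with document 0
    2: {30, 31, 32},   # shares nothing -> its own component
}

parent = {doc: doc for doc in buckets}

def find(x):
    while parent[x] != x:
        parent[x] = parent[parent[x]]
        x = parent[x]
    return x

def union(a, b):
    parent[find(a)] = find(b)

seen = {}  # bucket -> first document seen hashing to it
for doc, bs in buckets.items():
    for b in bs:
        union(doc, seen.setdefault(b, doc))

print({doc: find(doc) for doc in buckets})  # 0 and 1 share a root, 2 stands alone
```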

The model has the following parameters:

- `cc.id_to_cc`: a NumPy array of integers, one entry per document, where
document `i` belongs to connected component number `cc.id_to_cc[i]`;
- `cc.id_to_elements`: like in `sourced.ml`'s `BOW` model, a Python dictionary
mapping each document to its name, e.g. if documents are files, then `cc.id_to_elements[i]`
is file `i`'s filename;
- `cc.id_to_buckets`: a SciPy sparse CSR matrix of shape `number of documents`
x `number of buckets`, where the element in row `i` and column `j` is equal to 1 if
document `i` hashes to bucket `j`, and 0 otherwise.

Example:

```
from apollo.graph import ConnectedComponentsModel

cc = ConnectedComponentsModel().load("cc.asdf")
print(cc.dump()) # prints the number of CCs and documents
```
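
Going a step further, the documented parameters above are enough to group file names by
component — a short sketch, assuming `id_to_elements` can be indexed by document id as
described:

```
from collections import defaultdict

from apollo.graph import ConnectedComponentsModel

cc = ConnectedComponentsModel().load("cc.asdf")

# Group file names by connected component id using only the documented parameters.
components = defaultdict(list)
for doc_id, comp_id in enumerate(cc.id_to_cc):
    components[int(comp_id)].append(cc.id_to_elements[doc_id])

largest = max(components.values(), key=len)
print("Largest component holds %d files" % len(largest))
```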
35 changes: 35 additions & 0 deletions doc/model/cmd.md
@@ -0,0 +1,35 @@
# Communities Model

This model stores the communities detected by the `cmd` command from a previously
created Connected Components model. Its contents depend heavily on the chosen algorithm
(and its parameters), and more importantly on the edge creation method,
as described in [the doc](/doc/cmd/cmd.md). Indeed, if the default linear method
is chosen, then the communities will consist not only of documents, but also
of **buckets**, as these are added to the CC graphs as artificial vertices.
This means that, in this case, some communities may consist *only* of buckets.

The model has the following parameters:

- `cmd.id_to_elements`: like in `sourced.ml`'s `BOW` model, a Python dictionary
mapping each document to its name, e.g. if documents are files, then `cmd.id_to_elements[i]`
is file `i`'s filename;
- `cmd.communities`: a list of lists of integers, where each integer in `cmd.communities[i]`
is an element of the `i`-th community. If an element `e` of a community is an integer smaller
than the length of the `cmd.id_to_elements` dictionary, then it is a document. Otherwise,
it is bucket number `e - len(cmd.id_to_elements)` in the `id_to_buckets` parameter of the
Connected Components model used as input.

The model also has this method:
- `cmd.count_elements()`: counts the number of distinct documents in the communities
(not all documents in the dictionary may be in a community, as we do not care about
communities of one). Buckets are not counted by this method.

Example:

```
from apollo.graph import CommunitiesModel

cmd = CommunitiesModel().load("cmd.asdf")
print(cmd.dump()) # prints the number of communities (even if containing only buckets)
print("Number of distinct documents: %s" % (cmd.count_elements()))
```
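
Building on the description above, documents and buckets inside a community can be told
apart by comparing indices with `len(cmd.id_to_elements)` — a small sketch under that
assumption (the model path is illustrative):

```
from apollo.graph import CommunitiesModel

cmd = CommunitiesModel().load("cmd.asdf")

# An element smaller than len(cmd.id_to_elements) is a document,
# anything else is a bucket offset by the number of documents.
n_docs = len(cmd.id_to_elements)
for i, community in enumerate(cmd.communities):
    docs = [e for e in community if e < n_docs]
    bucket_count = len(community) - len(docs)
    if docs:  # skip communities made only of buckets
        print("community %d: %d documents, %d buckets" % (i, len(docs), bucket_count))
```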
23 changes: 23 additions & 0 deletions doc/model/wmh.md
@@ -0,0 +1,23 @@
# Weighted MinHash Parameters Model

This model stores the parameters generated by `libMHCUDA`'s `minhash_cuda_retrieve_vars`
function when running the `hash` command. Named as in Sergey Ioffe's paper,
the parameters are:

- `wmh.rs`: the quantization granularity;
- `wmh.ln_cs`: the logarithm of the Cauchy variates;
- `wmh.betas`: the random offsets.

All three are NumPy arrays of shape `hash size` x `number of features`. If you need to run
the `hash` command multiple times, reuse this model each time; otherwise the results will
not be comparable, as the parameters would be regenerated at random.

Example:

```
from apollo.hasher import WeightedMinHashParameters

wmh = WeightedMinHashParameters().load("params.asdf")
print(wmh.dump()) # prints the shape of matrices
```
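
A small sanity-check sketch using only the attributes listed above — handy before reusing
the model for another `hash` run:

```
from apollo.hasher import WeightedMinHashParameters

wmh = WeightedMinHashParameters().load("params.asdf")

# All three parameter matrices should share the same
# `hash size` x `number of features` shape.
assert wmh.rs.shape == wmh.ln_cs.shape == wmh.betas.shape
hash_size, num_features = wmh.rs.shape
print("hash size: %d, features: %d" % (hash_size, num_features))
```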
2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,4 +2,4 @@ cassandra_driver==3.14.0
libMHCUDA==2.1.0
python-igraph==0.7.1.post6
jinja2==2.10
sourced-ml==0.6.0
sourced-ml[tf]==0.6.1
2 changes: 1 addition & 1 deletion setup.py
@@ -21,7 +21,7 @@
"libMHCUDA >= 2.0, <3.0",
"jinja2 >=2.0, <3.0",
"python-igraph >= 0.7, <2.0",
"sourced-ml >= 0.6.0, <0.7"],
"sourced-ml[tf] >= 0.6.0, <0.7"],
package_data={"": ["LICENSE", "README.md"] + glob(path.join("apollo", "*.jinja2"))},
classifiers=[
"Development Status :: 3 - Alpha",
22 changes: 22 additions & 0 deletions tests/cluster_utils.py
@@ -0,0 +1,22 @@
from cassandra.cluster import Cluster, NoHostAvailable
from cassandra.policies import RoundRobinPolicy


def create_session(keyspace="apollo"):
    # Connect to a local Cassandra/Scylla node; return None when no host is
    # reachable so that callers (e.g. tests) can skip gracefully.
    cluster = Cluster(("localhost",), port=9042,
                      load_balancing_policy=RoundRobinPolicy())
    try:
        session = cluster.connect(keyspace)
    except NoHostAvailable:
        session = None
    return session


def count_table(session, table, keyspace="apollo"):
    # Return the number of rows in <keyspace>.<table>.
    for row in session.execute("SELECT COUNT(*) from %s.%s;" % (keyspace, table)):
        return row.count


def extract_row(session, table, keyspace="apollo"):
    # Return the first row of <keyspace>.<table>, or None if the table is empty.
    for row in session.execute("SELECT * from %s.%s;" % (keyspace, table)):
        return row
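
A sketch of how these helpers might be used from a test; the `bags` table name and the
unittest layout here are illustrative assumptions, not part of this module:

```
import unittest

from tests.cluster_utils import create_session, count_table, extract_row


class BagsTableTest(unittest.TestCase):
    def setUp(self):
        # Skip the whole test when no Cassandra/Scylla node is reachable.
        self.session = create_session()
        if self.session is None:
            self.skipTest("no Cassandra/Scylla cluster on localhost:9042")

    def test_bags_table_is_populated(self):
        # "bags" is a hypothetical table name, used purely for illustration.
        self.assertGreater(count_table(self.session, "bags"), 0)
        self.assertIsNotNone(extract_row(self.session, "bags"))


if __name__ == "__main__":
    unittest.main()
```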
Binary file added tests/models/bow.asdf
Binary file added tests/models/docfreq.asdf
Binary file added tests/models/params.asdf
10 changes: 10 additions & 0 deletions tests/raw_files/hello_world.py
@@ -0,0 +1,10 @@
import sys


def main():
    print("Hello, world")
    return 0


if __name__ == "__main__":
    sys.exit(main())