[WIP] Add tests everywhere #59

Open · wants to merge 3 commits into base: master
1 change: 0 additions & 1 deletion .gitignore
@@ -1,6 +1,5 @@
_book
bundle
*.asdf

#Mac OS
*.DS_Store
3 changes: 2 additions & 1 deletion .travis.yml
@@ -13,7 +13,7 @@ _install: &_install
- gimme 1.8
- source ~/.gimme/envs/latest.env
- pip install --upgrade pip
- pip install codecov
- pip install -r requirements.txt codecov
- pip install -e .
_coverage: &_coverage
- SCRIPT="coverage run --concurrency=multiprocessing -m unittest discover && coverage combine"
@@ -38,6 +38,7 @@ before_script:
- docker run -d --privileged -p 9432:9432 --name bblfshd bblfsh/bblfshd
- docker exec -it bblfshd bblfshctl driver install python bblfsh/python-driver
- docker run --name scylla -p 9042:9042 -d scylladb/scylla --developer-mode=1
- sleep 90
script:
- (eval "$SCRIPT")
notifications:
18 changes: 15 additions & 3 deletions README.md
@@ -55,6 +55,8 @@ Apollo is structured as a series of commands in CLI. It stores data in [Cassandr
writes MinHashCuda batches on disk. Community detection is delegated to [igraph](http://igraph.org/python/).

* `resetdb` (erases) and initializes a Cassandra keyspace.
* `preprocess` creates indexes for the files and features, and stores them on disk.
Runs source{d} engine through PySpark.
* `bags` extracts the features, stores them in the database and writes MinHashCuda batches on disk.
Runs source{d} engine through PySpark.
* `hash` performs the hashing, writes the hashtables to the database and hashing parameters on disk
@@ -94,13 +96,23 @@ in your browser. There are multiple Docker options available, e.g.

## Docker command snippets

### Preprocess

```
docker run -it --rm -v /path/to/io:/io --link bblfshd srcd/apollo preprocess -r /io/siva \
--cached-index-path /io/bags/index.asdf --docfreq-out /io/bags/docfreq.asdf -f id lit uast2seq --uast2seq-seq-len 4 \
-l Java Python -s 'local[*]' --min-docfreq 5 --bblfsh bblfshd --persist MEMORY_ONLY \
--config spark.executor.memory=4G spark.driver.memory=10G spark.driver.maxResultSize=4G
```

### Bags

```
docker run -it --rm -v /path/to/io:/io --link bblfshd --link scylla srcd/apollo bags -r /io/siva \
--bow /io/bags/bow.asdf --docfreq /io/bags/docfreq.asdf -f id lit uast2seq --uast2seq-seq-len 4 \
-l Java Python -s 'local[*]' --min-docfreq 5 --bblfsh bblfshd --cassandra scylla --persist MEMORY_ONLY \
--config spark.executor.memory=4G spark.driver.memory=10G spark.driver.maxResultSize=4G
--bow /io/bags/bow.asdf --cached-index-path /io/bags/index.asdf --docfreq-in /io/bags/docfreq.asdf \
-f id lit uast2seq --uast2seq-seq-len 4 -l Java Python -s 'local[*]' --min-docfreq 5 --bblfsh bblfshd \
--cassandra scylla --persist MEMORY_ONLY --config spark.executor.memory=4G \
spark.driver.memory=10G spark.driver.maxResultSize=4G
```

### Hash
3 changes: 1 addition & 2 deletions apollo/__main__.py
@@ -8,9 +8,8 @@
from modelforge.logs import setup_logging
from sourced.ml import extractors
from sourced.ml.utils import add_engine_args, add_spark_args
from sourced.ml.cmd import ArgumentDefaultsHelpFormatterNoNone
from sourced.ml.cmd.args import add_bow_args, add_feature_args, add_repo2_args, \
add_df_args, add_repartitioner_arg
add_df_args, add_repartitioner_arg, ArgumentDefaultsHelpFormatterNoNone

from apollo.bags import preprocess, source2bags
from apollo.cassandra_utils import reset_db
4 changes: 2 additions & 2 deletions apollo/bags.py
@@ -1,5 +1,5 @@
from pyspark.sql.types import Row
from sourced.ml.cmd import repos2bow_template, repos2bow_index_template
from sourced.ml.cmd import repos2bow_template, repos2bow_index
from sourced.ml.transformers import Transformer

from apollo import cassandra_utils
@@ -46,7 +46,7 @@ def __call__(self, head):


def preprocess(args):
return repos2bow_index_template(args)
return repos2bow_index(args)


def source2bags(args):
4 changes: 2 additions & 2 deletions apollo/cassandra_utils.py
@@ -127,7 +127,7 @@ def _pump(self):
rows = self.session.execute(query)
buffer = self.buffer
buffer.extend(None for _ in items)
l = len(items) # noqa
num_items = len(items)
count = 0
for r in rows:
count += 1
@@ -138,7 +138,7 @@ def _pump(self):
m = None
# reverse order - we will pop() in __next__
tr = r.sha1, (r.repo, r.commit, r.path)
buffer[l - i - 1] = (tr + (m,)) if meta else tr
buffer[num_items - i - 1] = (tr + (m,)) if meta else tr
self._log.debug("-> %d", count)


3 changes: 3 additions & 0 deletions apollo/graph.py
@@ -185,6 +185,9 @@ def _generate_tree(self):
indptr[i + 1] = pos
return {"data": data, "indptr": indptr, "elements": merge_strings(self.id_to_element)}

def dump(self):
return "Number of communities: %s" % (len(self.communities))

def count_elements(self):
return sum(sum(1 for i in c if i < len(self.id_to_element)) for c in self.communities)

31 changes: 23 additions & 8 deletions doc/101.md
@@ -10,19 +10,34 @@ Cassandra or ScyllaDB must be running.
Apollo works with Git repositories stored in [Siva](https://github.com/src-d/go-siva) format.
Refer to [Borges](https://github.com/src-d/borges). We expect that the files will be in `/data` below.

### Index the files

We create the `OrderedDocumentFrequency` model, which stores the index and the global value
frequencies of the features, `QuantizationLevels` models if need be, and a `DocumentFrequency`
model which holds the index of all files.

```
apollo preprocess -r /data --cached-index-path index.asdf --docfreq docfreq.asdf \
-f lit id uast2seq --uast2seq-seq-len 4 --uast2seq-weight 2 --min-docfreq 4 \
-l Java Python --persist DISK_ONLY
```

> Docker users should add `--bblfsh bblfshd`.

More about [`preprocess`](cmd/preprocess.md).

### Extract the features

We convert every file into a [weighted set of features](https://en.wikipedia.org/wiki/Bag-of-words_model).
The batches for the `hash` command are written to `./bow*.asdf` (by default splitted by 2 GB) and
the calculated global feature value frequencies are written to`./docfreq.asdf`. We use three
extractors: literals, identifiers and deterministic AST subpaths of size 4. We double the importance
of the latter features and throw away any values which appear in less than 4 files. Only Java source
code is analysed. We optimize the pipeline executing by using the disk cache to save
the [UASTs](https://doc.bblf.sh/uast/code-to-ast.html) between each pass. The extracted bags
are additionally saved in the database.
The batches for the `hash` command are written to `./bow*.asdf` (split into 2 GB files by default).
We use three extractors: literals, identifiers and deterministic AST subpaths of size 4.
We double the importance of the latter features and throw away any values which appear
in fewer than 4 files. Only Java and Python source code is analysed. We optimize the pipeline
execution by using the disk cache to save the [UASTs](https://doc.bblf.sh/uast/code-to-ast.html)
between each pass. The extracted bags are additionally saved in the database.

```
apollo bags -r /data --bow bow.asdf --docfreq docfreq.asdf \
apollo bags -r /data --bow bow.asdf --docfreq docfreq.asdf --cached-index-path index.asdf \
-f lit id uast2seq --uast2seq-seq-len 4 --uast2seq-weight 2 --min-docfreq 4 \
-l Java Python --persist DISK_ONLY
```
4 changes: 4 additions & 0 deletions doc/SUMMARY.md
@@ -18,3 +18,7 @@
* [cmd](cmd/cmd.md)
* [dumpcmd](cmd/dumpcmd.md)
* [evalcc](cmd/evalcc.md)
* Models reference
* [Weighted MinHash parameters](model/wmh.md)
* [Connected components](model/cc.md)
* [Communities](model/cmd.md)
4 changes: 3 additions & 1 deletion doc/cmd/bags.md
@@ -1,6 +1,7 @@
# Bags command

This command converts input repositories to unordered weighted bags of features that are stored in DB, writes MinHashCuda batches, and writes the Ordered Documents Frequency model as well as the optional Quantization Levels model. You can specify the following arguments:
This command converts input repositories to unordered weighted bags of features that are stored in DB,
and written as `BOW` models to be used as MinHashCuda batches. You can specify the following arguments:

- `-r`/`--repositories` : Path to the input files
- `--parquet`: If your input files are Parquet files
@@ -14,6 +15,7 @@ This command converts input repositories to unordered weighted bags of features
- `--docfreq-out`: Path to the output Ordered Document Frequency model (can not be used with `docfreq-in`)
- `-v`/`--vocabulary-size`: to specify the maximum vocabulary size, defaults to 10 million
- `--cached-index-path`: Path to a precomputed Document Frequency model storing an index of the documents to be extracted
- `--num-iterations`: to select the number of iterations over which the data is processed (this can prevent failures when the amount of data is large), defaults to 1
- `--partitions`: to repartition data, this will specify new number of partitions
- `--shuffle`: to repartition data, this will allow data shuffling (vital if number of partitions increases !)
- [Feature arguments](features.md)
4 changes: 3 additions & 1 deletion doc/cmd/cc.md
@@ -1,6 +1,8 @@
# CC command

This command runs the connected components analysis on previously created hash tables, you can specify the following arguments:
This command runs the connected components analysis on previously created hash tables
and saves the connected components in [this `Model`](/doc/model/cc.md). You can specify the following arguments:

- `-o`/`--output`: Path to the output Connected Components model
- [Cassandra/Scylla arguments](db.md)

4 changes: 3 additions & 1 deletion doc/cmd/cmd.md
@@ -2,7 +2,9 @@

__Currently does not work in Spark Cluster mode.__

This command runs the community detection on a Connected Components model, you can specify the following arguments:
This command runs the community detection on a previously created Connected Components
model and saves the communities in [this `Model`](/doc/model/cmd.md). You can specify
the following arguments:

- `-i`/`--input`: Path to the input Connected Components model
- `-o`/`--output`: Path to the output Community Detection model
4 changes: 3 additions & 1 deletion doc/cmd/hash.md
@@ -2,7 +2,9 @@

__Currently does not work in Spark Cluster mode.__

This command applies the MinHashCUDA algorithm on previously written batches, stores hashes and hash tables in DB and saves the Weighted MinHash (WMH) parameters.
This command applies the MinHashCUDA algorithm on previously written batches,
stores hashes and hash tables in DB and saves the Weighted MinHash (WMH) parameters
in [this `Model`](/doc/model/wmh.md). You can specify the following arguments:

- `-i`/`--input`: Path to the input batch(es)
- `--seed`: Specific random generator (useful for cross execution comparisons), defaults to a random number depending on the time
31 changes: 31 additions & 0 deletions doc/model/cc.md
@@ -0,0 +1,31 @@
# Connected Components Model

This model stores the connected components found in the pairwise similarity
graph after hashing by the `cc` command.

**A quick reminder**

A document hashes to as many buckets as there are hashtables, which means if there are
3 hashtables, then a document hashes to 3 buckets. The number of hashtables increases
as the similarity threshold decreases. Any two documents that hash to at least one bucket
in common are in the same component.
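
As a quick toy illustration (plain Python, not Apollo code), here is a minimal union-find
sketch of this rule — documents that share at least one bucket end up in the same component:

```
# document id -> the buckets it hashes to (here, 3 hashtables -> 3 buckets each)
buckets = {
    0: {10, 11, 12},
    1: {12, 20, 21},   # shares bucket 12 with document 0
    2: {30, 31, 32},   # shares nothing -> its own component
}

parent = {doc: doc for doc in buckets}

def find(x):
    while parent[x] != x:
        parent[x] = parent[parent[x]]
        x = parent[x]
    return x

def union(a, b):
    parent[find(a)] = find(b)

seen = {}  # bucket -> first document seen hashing to it
for doc, bs in buckets.items():
    for b in bs:
        union(doc, seen.setdefault(b, doc))

print({doc: find(doc) for doc in buckets})  # 0 and 1 share a root, 2 stands alone
```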

The model has the following parameters:

- `cc.id_to_cc`: a NumPy array of integers, one entry per document, where
document `i` belongs to connected component number `cc.id_to_cc[i]`;
- `cc.id_to_elements`: like in `sourced.ml`'s `BOW` model, a Python dictionary
mapping each document to its name, e.g. if documents are files, then `cc.id_to_elements[i]`
is file `i`'s filename;
- `cc.id_to_buckets`: a SciPy sparse CSR matrix of shape `number of documents`
x `number of buckets`, where the element in row `i` and column `j` is equal to 1 if
document `i` hashes to bucket `j`, and 0 otherwise.

Example:

```
from apollo.graph import ConnectedComponentsModel

cc = ConnectedComponentsModel().load("cc.asdf")
print(cc.dump()) # prints the number of CCs and documents
```
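
Going a step further, the documented parameters above are enough to group file names by
component — a short sketch, assuming `id_to_elements` can be indexed by document id as
described:

```
from collections import defaultdict

from apollo.graph import ConnectedComponentsModel

cc = ConnectedComponentsModel().load("cc.asdf")

# Group file names by connected component id using only the documented parameters.
components = defaultdict(list)
for doc_id, comp_id in enumerate(cc.id_to_cc):
    components[int(comp_id)].append(cc.id_to_elements[doc_id])

largest = max(components.values(), key=len)
print("Largest component holds %d files" % len(largest))
```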
35 changes: 35 additions & 0 deletions doc/model/cmd.md
@@ -0,0 +1,35 @@
# Communities Model

This model stores the communities detected by the `cmd` command from a previously
created Connected Components model. Its contents depend heavily on the chosen algorithm
(and its parameters), and more importantly on the edge creation method,
as described in [the doc](/doc/cmd/cmd.md). Indeed, if the default linear method
is chosen, then the communities will consist not only of documents, but also
of **buckets**, as these are added to the CC graphs as artificial vertices.
This means that, in this case, some communities may consist *only* of buckets.

The model has the following parameters:

- `cmd.id_to_elements`: like in `sourced.ml`'s `BOW` model, a Python dictionary
mapping each document to its name, e.g. if documents are files, then `cmd.id_to_elements[i]`
is file `i`'s filename;
- `cmd.communities`: a list of lists of integers, where each integer in `cmd.communities[i]`
is an element of the `i`-th community. If an element `e` of a community is an integer smaller
than the length of the `cmd.id_to_elements` dictionary, then it is a document. Otherwise,
it is bucket number `e - len(cmd.id_to_elements)` in the `id_to_buckets` parameter of the
Connected Components model used as input.

The model also has this method:
- `cmd.count_elements()`: counts the number of distinct documents in the communities
(not all documents in the dictionary may be in a community, as we do not care about
communities of one). Buckets are not counted by this method.

Example:

```
from apollo.graph import CommunitiesModel

cmd = CommunitiesModel().load("cmd.asdf")
print(cmd.dump()) # prints the number of communities (even if containing only buckets)
print("Number of distinct documents: %s" % (cmd.count_elements()))
```
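
Building on the description above, documents and buckets inside a community can be told
apart by comparing indices with `len(cmd.id_to_elements)` — a small sketch under that
assumption (the model path is illustrative):

```
from apollo.graph import CommunitiesModel

cmd = CommunitiesModel().load("cmd.asdf")

# An element smaller than len(cmd.id_to_elements) is a document,
# anything else is a bucket offset by the number of documents.
n_docs = len(cmd.id_to_elements)
for i, community in enumerate(cmd.communities):
    docs = [e for e in community if e < n_docs]
    bucket_count = len(community) - len(docs)
    if docs:  # skip communities made only of buckets
        print("community %d: %d documents, %d buckets" % (i, len(docs), bucket_count))
```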
23 changes: 23 additions & 0 deletions doc/model/wmh.md
@@ -0,0 +1,23 @@
# Weighted MinHash Parameters Model

This model stores the parameters generated by `libMHCUDA`'s `minhash_cuda_retrieve_vars`
function when running the `hash` command. Named as in Sergey Ioffe's paper,
the parameters are:

- `wmh.rs`: the quantization granularity;
- `wmh.ln_cs`: the logarithm of the Cauchy variates;
- `wmh.betas`: the random offsets.

All three are NumPy arrays of shape `hash size` x `number of features`. If you need to run
the `hash` command multiple times, reuse this model each time; otherwise the results will
not be comparable, as the parameters would be regenerated at random.

Example:

```
from apollo.hasher import WeightedMinHashParameters

wmh = WeightedMinHashParameters().load("params.asdf")
print(wmh.dump()) # prints the shape of matrices
```
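
A small sanity-check sketch using only the attributes listed above — handy before reusing
the model for another `hash` run:

```
from apollo.hasher import WeightedMinHashParameters

wmh = WeightedMinHashParameters().load("params.asdf")

# All three parameter matrices should share the same
# `hash size` x `number of features` shape.
assert wmh.rs.shape == wmh.ln_cs.shape == wmh.betas.shape
hash_size, num_features = wmh.rs.shape
print("hash size: %d, features: %d" % (hash_size, num_features))
```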
2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,4 +2,4 @@ cassandra_driver==3.14.0
libMHCUDA==2.1.0
python-igraph==0.7.1.post6
jinja2==2.10
sourced-ml==0.6.0
sourced-ml[tf]==0.6.1
2 changes: 1 addition & 1 deletion setup.py
@@ -21,7 +21,7 @@
"libMHCUDA >= 2.0, <3.0",
"jinja2 >=2.0, <3.0",
"python-igraph >= 0.7, <2.0",
"sourced-ml >= 0.6.0, <0.7"],
"sourced-ml[tf] >= 0.6.0, <0.7"],
package_data={"": ["LICENSE", "README.md"] + glob(path.join("apollo", "*.jinja2"))},
classifiers=[
"Development Status :: 3 - Alpha",
22 changes: 22 additions & 0 deletions tests/cluster_utils.py
@@ -0,0 +1,22 @@
from cassandra.cluster import Cluster, NoHostAvailable
from cassandra.policies import RoundRobinPolicy


def create_session(keyspace="apollo"):
    # Connect to a local Cassandra/Scylla node; return None when no host is
    # reachable so that callers (e.g. tests) can skip gracefully.
    cluster = Cluster(("localhost",), port=9042,
                      load_balancing_policy=RoundRobinPolicy())
    try:
        session = cluster.connect(keyspace)
    except NoHostAvailable:
        session = None
    return session


def count_table(session, table, keyspace="apollo"):
    # Return the number of rows in <keyspace>.<table>.
    for row in session.execute("SELECT COUNT(*) from %s.%s;" % (keyspace, table)):
        return row.count


def extract_row(session, table, keyspace="apollo"):
    # Return the first row of <keyspace>.<table>, or None if the table is empty.
    for row in session.execute("SELECT * from %s.%s;" % (keyspace, table)):
        return row
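
A sketch of how these helpers might be used from a test; the `bags` table name and the
unittest layout here are illustrative assumptions, not part of this module:

```
import unittest

from tests.cluster_utils import create_session, count_table, extract_row


class BagsTableTest(unittest.TestCase):
    def setUp(self):
        # Skip the whole test when no Cassandra/Scylla node is reachable.
        self.session = create_session()
        if self.session is None:
            self.skipTest("no Cassandra/Scylla cluster on localhost:9042")

    def test_bags_table_is_populated(self):
        # "bags" is a hypothetical table name, used purely for illustration.
        self.assertGreater(count_table(self.session, "bags"), 0)
        self.assertIsNotNone(extract_row(self.session, "bags"))


if __name__ == "__main__":
    unittest.main()
```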
Binary file added tests/models/bow.asdf
Binary file added tests/models/docfreq.asdf
Binary file added tests/models/params.asdf
10 changes: 10 additions & 0 deletions tests/raw_files/hello_world.py
@@ -0,0 +1,10 @@
import sys


def main():
    print("Hello, world")
    return 0


if __name__ == "__main__":
    sys.exit(main())