diff --git a/README.md b/README.md index 383722b..bbf4003 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ TODO - parallel processing - KMV sketch-based acceleration - ability to limit output to top N items +- asymmetrical metrics for inactive items - easy deployment: single, statically-linked executable - [unix philosophy](https://en.wikipedia.org/wiki/Unix_philosophy) @@ -97,23 +98,24 @@ i4 i5 0.0550 # CLI options -| option | info | -| -----------: | ---------------------------------------------------------------- | -| **i** | input path | -| **o** | output path (default: stdout) | -| **f** | output format, (default: ida,idb,cos) | -| **w** | number of workers (default: 1) | -| **k** | KMV sketch capacity, 0 for not using sketches (default: 0) | -| **ih** | input has header | -| **oh** | include header in output | -| **top** | output only top N results, 0 for all results (default: 0) | -| **topcol** | output column number for top N selection (1-based) (default: 3) | -| **buf** | line buffer capacity in MB (default: 100) | -| **coli** | column number of item id (1-based) (default: 1) | -| **colf** | column number of features (1-based) (default: 2) | -| **cmin** | minimum number of common features to show in output (default: 1) | -| **diag** | include diagonal in the output | -| **full** | include upper and lower triangle in the output | +| option | info | +| ------------: | ---------------------------------------------------------------- | +| **i** | input path | +| **o** | output path (default: stdout) | +| **f** | output format, (default: ida,idb,cos) | +| **iinactive** | inactive items input path (no header!) 
| +| **w** | number of workers (default: 1) | +| **k** | KMV sketch capacity, 0 for not using sketches (default: 0) | +| **ih** | input has header | +| **oh** | include header in output | +| **top** | output only top N results, 0 for all results (default: 0) | +| **topcol** | output column number for top N selection (1-based) (default: 3) | +| **buf** | line buffer capacity in MB (default: 100) | +| **coli** | column number of item id (1-based) (default: 1) | +| **colf** | column number of features (1-based) (default: 2) | +| **cmin** | minimum number of common features to show in output (default: 1) | +| **diag** | include diagonal in the output | +| **full** | include upper and lower triangle in the output | ## Output format diff --git a/src/thorvald.go b/src/thorvald.go index 0bcb7aa..59df92d 100644 --- a/src/thorvald.go +++ b/src/thorvald.go @@ -137,21 +137,22 @@ func other_triangle_format(fmt []string) []string { type Cfg struct { - input_path string - output_path string - buf_cap int - sketch_cap int - item_col int - top_n int - top_col int - features_col int - output_fmt string - workers int - c_min int - header_in bool - header_out bool - diagonal bool - full bool + input_path string + output_path string + inactive_path string + buf_cap int + sketch_cap int + item_col int + top_n int + top_col int + features_col int + output_fmt string + workers int + c_min int + header_in bool + header_out bool + diagonal bool + full bool } @@ -161,6 +162,7 @@ type Engine struct { cfg Cfg features_by_item map[string]map[uint32]bool range_by_item map[string]int + inactive map[string]bool all_features map[uint32]bool feature_freq map[uint32]int feature_idf map[uint32]float64 @@ -178,6 +180,22 @@ type Engine struct { output *log.Logger // handles concurrent writes } +func (e *Engine) load_inactive() { + if len(e.cfg.inactive_path)==0 { return } + file, err := os.Open(e.cfg.inactive_path) + check(err) + scanner := bufio.NewScanner(file) + scanner.Split(bufio.ScanLines) + + pg 
:= Progress(0,"INACTIVE","items") + for scanner.Scan() { + id := scanner.Text() + e.inactive[id] = true + pg.Add(1) + } + file.Close() + pg.Close() +} func (e *Engine) load() { // --- SET / KMV SKETCH CONSTRUCTION -------------------------------------- @@ -307,7 +325,8 @@ func (e *Engine) calc_idf() { // TODO: String() type record struct { - val float32 + val float32 // required for top N sorting + idb string // required for inactive items detection str string } @@ -451,6 +470,7 @@ func (e *Engine) item_item(i int, j int, partition int) (out [2]record) { out[r].val = float32(val) } out[r].str = strings.Join(columns, "\t") + out[r].idb = mj } return out } @@ -478,11 +498,13 @@ func (e *Engine) calc_similarity() { records := make([]record, 2*e.items_cnt) for j:=j0; j