BrendelGroup · vpbrendel · Mar 2, 2020 · Feb 4, 2020 · Feb 10, 2020 · Feb 10, 2020
diff --git a/03-b-daphnia.ipynb b/03-b-daphnia.ipynb
diff --git a/03-c-volvox.md b/03-c-volvox.md
@@ -11,9 +11,9 @@ We then use a custom Python script to assign each iLocus a provisional status ba
 
 ```bash
 cd chlorophyta
-genhub-build.py --cfgdir=config/ --batch=chlorophyta+ \
-                --workdir=../data/ --numprocs=4 \
-                download format prepare stats cluster
+fidibus --cfgdir=config/ --refrbatch=chlorophyta+ \
+                --workdir=../data/ --numprocs=13 \
+                download prep iloci breakdown stats cluster
 python status.py GenHub.hiloci.tsv > Chlorophyta.hiLocus.pre-status.tsv
 ```
 

diff --git a/Atha/AT_iloci.tsv b/Atha/AT_iloci.tsv
diff --git a/Atha/AT_miloci.tsv b/Atha/AT_miloci.tsv
diff --git a/Atha/AT_piloci.tsv b/Atha/AT_piloci.tsv
diff --git a/Atha/README.md b/Atha/README.md
diff --git a/Atha/phisigma-Atha-min2Mb.tsv b/Atha/phisigma-Atha-min2Mb.tsv
diff --git a/Atha/phisigma-Atha-min500kb.tsv b/Atha/phisigma-Atha-min500kb.tsv
diff --git a/Atha/phisigma-Atha.tsv b/Atha/phisigma-Atha.tsv
diff --git a/compare/Amel/Amel.iloci.tsv b/compare/Amel/Amel.iloci.tsv
diff --git a/compare/Amel/Amh3.iloci.tsv b/compare/Amel/Amh3.iloci.tsv
diff --git a/compare/Amel/chaining.sh b/compare/Amel/chaining.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+set -e 
+set -u
+set -o pipefail
+
+lastz Amh3.iloci.fa[multiple] Amel.iloci.fa --match=1,9 --filter=identity:95 --chain \
+		  format=general:name1,length1,size1,name2,length2,size2,identity,nmatch \
+		  > entire.tsv
+lastz Amh3.iloci.fa[multiple] Amel.filoci.fa --match=1,9 --filter=identity:95 --chain \
+		  format=general:name1,length1,size1,name2,length2,size2,identity,nmatch \
+		  > fi.tsv
+lastz Amh3.iloci.fa[multiple] Amel.ciloci.fa --match=1,9 --filter=identity:95 --chain \
+		  format=general:name1,length1,size1,name2,length2,size2,identity,nmatch \
+		  > ci.tsv
+lastz Amh3.iloci.fa[multiple] Amel.niloci.fa --match=1,9 --filter=identity:95 --chain \
+                  format=general:name1,length1,size1,name2,length2,size2,identity,nmatch \
+                  > ni.tsv
+lastz Amh3.iloci.fa[multiple] Amel.iiloci.fa --match=1,9 --filter=identity:95 --chain \
+                  format=general:name1,length1,size1,name2,length2,size2,identity,nmatch \
+                  > ii.tsv
+lastz Amh3.iloci.fa[multiple] Amel.siloci.fa --match=1,9 --filter=identity:95 --chain \
+                  format=general:name1,length1,size1,name2,length2,size2,identity,nmatch \
+                  > si.tsv
diff --git a/compare/Amel/ci-count.py b/compare/Amel/ci-count.py
@@ -0,0 +1,52 @@
+import pickle
+import pandas as pd
+
+species = "Amh3"
+iloci = pd.read_csv(species + '.iloci.tsv',sep='\t')
+ii = iloci['LocusClass'] == "iiLocus"
+iiloci = iloci[ii]
+fi = iloci['LocusClass'] == "fiLocus"
+filoci = iloci[fi]
+siloci = iloci[iloci['LocusClass'] == "siLocus"]
+niloci = iloci[iloci['LocusClass'] == "niLocus"]
+ciloci = iloci[iloci['LocusClass'] == "ciLocus"]
+
+ii_ids = set(iiloci['LocusId'])
+fi_ids = set(filoci['LocusId'])
+si_ids = set(siloci['LocusId'])
+ni_ids = set(niloci['LocusId'])
+ci_ids = set(ciloci['LocusId'])
+
+with open('Amel_ci-relations','rb') as f:
+    rels = pickle.load(f)
+si_count = 0
+ci_count = 0
+ii_count = 0
+ni_count = 0
+fi_count = 0
+ties = 0
+for key in rels:
+    if len(rels[key]) < 1: 
+        continue
+    elif len(rels[key]) > 1: 
+        ties += 1
+    else:
+        for match in rels[key]:
+            if match in si_ids:
+                si_count += 1
+            elif match in ci_ids:
+                ci_count += 1
+            elif match in ii_ids:
+                ii_count += 1
+            elif match in ni_ids:
+                ni_count += 1
+            elif match in fi_ids:
+                fi_count += 1
+            else:
+                raise ValueError("Wrong id")
+print(si_count)
+print(ci_count)
+print(ni_count)
+print(ii_count)
+print(fi_count)
+print(ties)
diff --git a/compare/Amel/ci-hsp.py b/compare/Amel/ci-hsp.py
@@ -0,0 +1,54 @@
+import pandas as pd
+import pickle
+blast = pd.read_csv("ci.tsv", sep='\t')
+blast[['num1','num2']] = blast['identity'].str.split('/',expand=True)
+blast[['num1','num2']] = blast[['num1','num2']].apply(pd.to_numeric)
+blast.rename(columns = {'#name1' : 'name1'}, inplace = True)
+iloci = list(set(blast.name2))
+num_matches = 0
+# Warning: this isn't the total number of giLoci. 
+# These are giLoci that have some chains, we later filter out based on 95% length
+# To get the actual number of giLoci, we need to go back to the giLoci data files
+
+num_conserved = 0
+relations = {}
+locus_lengths = {}
+for locus in iloci: 
+    indices = blast.name2 == locus
+    ilocus = blast[indices]
+    query_length = ilocus.iloc[0]['size2']
+    match_loci = list(set(ilocus['name1']))
+    chains = {}
+    for match in match_loci:
+        indices = ilocus.name1 == match
+        hsp = ilocus[indices]
+        length = hsp['nmatch'].max()
+        if length / query_length >= 0.9:
+            chains[match] = length
+    try:
+        targets = [key for m in [max(chains.values())] for key,val in chains.items() if val == m]
+        if chains[targets[0]] > ilocus[ilocus.name1 == targets[0]].iloc[0]['size1'] * 0.9:
+            num_conserved += 1
+        if len(targets) > 1:
+            ids = {}
+            for target in targets:
+                search = ilocus[ilocus.name1 == target]
+                if len(search) > 1:
+                    max_len = search['nmatch'].max()
+                    search = search[search.nmatch == max_len]
+                assert len(search) <= 1
+                ids[target] = search['num1'].sum() / search['num2'].sum()
+                tiebreakers = [key for m in [max(ids.values())] for key,val in ids.items() if val == m]
+            relations[locus] = tiebreakers
+        else:
+            relations[locus] = targets
+    except ValueError:
+        continue
+with open('Amel_ci-relations','wb') as f:
+    pickle.dump(relations,f)
+for locus in relations:
+    if len(relations[locus]) > 0:
+        num_matches += 1
+
+print(str(num_matches) + ' iLoci had at least one match')
+print('Conserved: ' + str(num_conserved))