# accs_for_couchdb2neo4j.py

#!/usr/bin/python
#
# Contains accessories (Some functions and some dicts) to convert from OSDF syntax to what will be loaded in Neo4j.

# Mapping from OSDF node type to the Neo4j node type it is loaded as:
# 'Case', 'File' or 'Tags', plus the dedicated 'MIMARKS' and 'Mixs' types.
# The entry order is left untouched because file_nodes below is derived
# from this dict.
nodes = {
    'project': 'Case',
    'study': 'Case',
    'subject': 'Case',
    'subject_attr': 'Case',
    'subject_attribute': 'Case',
    'visit': 'Case',
    'visit_attr': 'Case',
    'visit_attribute': 'Case',
    'sample': 'Case',
    'sample_attr': 'Case',
    'sample_attribute': 'Case',
    'wgs_dna_prep': 'File',
    'host_seq_prep': 'File',
    'wgs_raw_seq_set': 'File',
    'wgs_raw_seq_set_private': 'File',
    'host_wgs_raw_seq_set': 'File',
    'microb_transcriptomics_raw_seq_set': 'File',
    'host_transcriptomics_raw_seq_set': 'File',
    'wgs_assembled_seq_set': 'File',
    'viral_seq_set': 'File',
    'annotation': 'File',
    'clustered_seq_set': 'File',
    '16s_dna_prep': 'File',
    '16s_raw_seq_set': 'File',
    '16s_trimmed_seq_set': 'File',
    'microb_assay_prep': 'File',
    'host_assay_prep': 'File',
    'proteome': 'File',
    'metabolome': 'File',
    'lipidome': 'File',
    'cytokine': 'File',
    'abundance_matrix': 'File',
    'tags': 'Tags',
    'mimarks': 'MIMARKS',
    'mixs': 'Mixs',
    'reference_genome_project_catalog_entry': 'File',
    'host_epigenetics_raw_seq_set': 'File',
    'serology': 'File',
    'metagenomic_project_catalog_entry': 'File',
    'alignment': 'File',
    'proteome_nonpride': 'File',
    'host_variant_call': 'File'
}

# OSDF node types that map to Neo4J File nodes, stored as {node_type: True}.
# Built with a comprehension so that no loop variable is left behind in the
# module namespace.
file_nodes = {
    node_type: True
    for node_type, neo4j_type in nodes.items()
    if neo4j_type == 'File'
}

# All edge types present in the schema. Each lowercase edge name maps to
# the same name in upper case, so the table is generated from the names.
# Note that 'subset_of' will be removed after loading in order to
# comply with iHMP schema.
edges = {
    edge_name: edge_name.upper()
    for edge_name in (
        'part_of',
        'subset_of',
        'participates_in',
        'associated_with',
        'by',
        'collected_during',
        'prepared_from',
        'sequenced_from',
        'derived_from',
        'computed_from',
        'has_tag',
        'has_mimarks',
        'has_mixs',
    )
}

# Known edges used in the dump scripts, each mapped to a node label
# ('Case', 'Tags', 'MIMARKS' or 'Mixs').
# The first seven edges all map to 'Case'; the has_* edges map to their own
# dedicated labels.
definitive_edges = dict.fromkeys(
    (
        'part_of',
        'subset_of',
        'participates_in',
        'by',
        'associated_with',
        'collected_during',
        'prepared_from',
    ),
    'Case'
)
definitive_edges.update({
    'has_tag': 'Tags',
    'has_mimarks': 'MIMARKS',
    'has_mixs': 'Mixs',
})

# Known edges used in the mirror script, each mapped to a node label.
# Every edge maps to 'Case' except 'computed_from', which maps to 'File'.
definitive_edges2 = dict.fromkeys(
    (
        'part_of',
        'subset_of',
        'participates_in',
        'by',
        'associated_with',
        'collected_during',
        'prepared_from',
    ),
    'Case'
)
definitive_edges2['computed_from'] = 'File'

# Study names are remapped to the short names listed here.
study_name_dict = {
    'Healthy Human Subjects': 'HHS',

    # 16S studies
    'Human microbiome project 16S production phase I.': '16S-PP1',
    'Human microbiome project 16S production phase II.': '16S-PP2',
    'Skin Microbiome in Disease States: Atopic Dermatitis and Immunodeficiency.': '16S-SM-ADI',
    'The Thrifty Microbiome: The Role of the Gut Microbiota in Obesity in the Amish.': '16S-GM-AO',
    "Diet, Genetic Factors, and the Gut Microbiome in Crohn's Disease.": '16S-GM-CD2',
    'Foregut Microbiome in Development of Esophageal Adenocarcinoma.': '16S-GM-EA',
    'The Role of the Gut Microbiota in Ulcerative Colitis, Targeted Gene Survey.': '16S-GM-UC',
    'The Human Microbiome in Pediatric Abdominal Pain and Intestinal Inflammation.': '16S-GM-CGD',
    "Effect of Crohn's Disease Risk Alleles on Enteric Microbiota.": '16S-GM-CD',
    'Evaluation of the Cutaneous Microbiome in Psoriasis.': '16S-SM-P',
    'The Neonatal Microbiome and Necrotizing Enterocolitis.': '16S-GM-NE',
    'The Microbial Ecology of Bacterial Vaginosis: A Fine Scale Resolution Metagenomic Study.': '16S-VM-BV',
    'The Vaginal Microbiome: Disease, Genetics and the Environment, 16S Gene Survey.': '16S-VM-DGE',
    'Urethral Microbiome of Adolescent Males.': '16S-UM-AD',

    # WGS studies
    "Metagenomic Analysis of the Structure and Function of the Human Gut Microbiota in Crohn's Disease.": 'WGS-GM-CD',
    'The Role of the Gut Microbiota in Ulcerative Colitis, Whole Metagenome Sequencing Project.': 'WGS-GM-UC',
    'The Human Virome in Children And Its Relationship to Febrile Illness.': 'WGS-VIR-FE',
    'Human microbiome project WGS production phase II.': 'WGS-PP2',
    'Human microbiome project WGS production phase I.': 'WGS-PP1',

    # Remaining names
    'ibdmdb': 'IBDMDB',
    'momspi': 'MOMS-PI',
    'Inflammatory Bowel Disease Multi-omics Database (IBDMDB)': 'IBDMDB',
    'prediabetes': 'T2D',
}

# Data format values are remapped to the display names listed here.
file_format_dict = {
    # sequence / annotation formats
    'sff': 'Standard Flowgram File',
    'peptide_fsa': 'Peptide FASTA',
    'gff3': 'GFF3',
    'nucleotide_fsa': 'Nucleotide FASTA',
    # 'null' is kept as the literal string 'null'
    'null': 'null',
    'fastq': 'FASTQ',
    'biom': 'Biological Observation Matrix',
    'fasta': 'FASTA',
    'csv': 'CSV',
    'mzXML': 'Mass Spectroscopy Proteomics',
}

# Body product values are remapped to the plain lowercase names listed here
# (ontology suffixes such as '[FMA:...]' / '[UBERON:...]' are dropped).
body_product_dict = {
    'Feces [FMA:64183]': 'feces',
    'Saliva [FMA:59862]': 'saliva',
    'vaginal mucosa [UBERON:0004983]': 'vaginal mucosa',
    # 'Stool' / 'stool' are folded into 'feces'
    'Stool': 'feces',
    'stool': 'feces',
    'saliva [UBERON:0001836]': 'saliva',
    'cervical mucus [UBERON:0012248]': 'cervical mucus',
    'feces [UBERON:0001988]': 'feces',
    'blood': 'blood',
    'Nasal': 'nasal',
    'urinary_tract': 'urinary tract',
    'nasal': 'nasal',
}

# Need this to add consistency to the body sites for query purposes.
# Every key is one accepted spelling of a body site and every value is the
# single form used in its place: '<lowercase name> [FMA:<id>]'.
# Keys come in three shapes:
#   * snake_case names                      ('anterior_nares')
#   * labels that already carry an FMA id   ('Buccal mucosa [FMA:59785]')
#   * bare FMA ids                          ('FMA:64183')
# 'test' and 'unknown' carry no FMA id and map to themselves.
body_site_dict = {
    'abdomen': 'abdomen [FMA:9577]',
    'antecubital_fossa': 'cubital fossa [FMA:39848]',
    'anterior_nares': 'external naris [FMA:59645]',
    'attached_keratinized_gingiva': 'gingiva [FMA:59762]',
    'back': 'back [FMA:14181]',
    'blood': 'blood cell [FMA:62844]',
    'buccal_mucosa': 'buccal mucosa [FMA:59785]',
    'Buccal mucosa [FMA:59785]': 'buccal mucosa [FMA:59785]',
    'cervix': 'cervix of uterus [FMA:17740]',
    'Dorsum of tongue [FMA:54651]': 'dorsum of tongue [FMA:54651]',
    'elbow': 'elbow [FMA:24901]',
    'External naris [FMA:59645]': 'external naris [FMA:59645]',
    'FMA:276108': 'right nasal cavity [FMA:276108]',
    'FMA:326482': 'urinary tract [FMA:326482]',
    'FMA:64183': 'feces [FMA:64183]',
    'FMA:86713': 'peripheral blood mononuclear cell [FMA:86713]',
    'FMA:7842': 'angle of seventh rib [FMA:7842]',
    'foot': 'foot [FMA:9664]',
    'forearm': 'forearm [FMA:9663]',
    'Gastrointestinal tract [FMA:71132]': 'gastrointestinal tract [FMA:71132]',
    'gingiva [FMA:59762]': 'gingiva [FMA:59762]',
    'Gingiva [FMA:59762]': 'gingiva [FMA:59762]',
    'gut': 'gastrointestinal tract [FMA:71132]',
    'hand': 'hand [FMA:9712]',
    'hard_palate': 'hard palate [FMA:55023]',
    'Hard palate [FMA:55023]': 'hard palate [FMA:55023]',
    'head': 'head [FMA:7154]',
    'ileal_pouch': 'ileum [FMA:7208]',
    'ileum': 'ileum [FMA:7208]',
    'knee': 'knee [FMA:24974]',
    'left_antecubital_fossa': 'left cubital fossa [FMA:39850]',
    'left_retroauricular_crease': 'skin of left auriculotemporal part of head [FMA:70332]',
    'leg': 'leg [FMA:24979]',
    'mid_vagina': 'vagina [FMA:19949]',
    'nare': 'external naris [FMA:59645]',
    'nasal': 'nasal cavity [FMA:54378]',
    'nasopharynx': 'nasopharynx [FMA:54878]',
    'Nasopharynx [FMA:54878]': 'nasopharynx [FMA:54878]',
    'oral_cavity': 'oral cavity [FMA:20292]',
    'Oral cavity [FMA:20292]': 'oral cavity [FMA:20292]',
    'Orifice of vagina [FMA:19984]': 'orifice of vagina [FMA:19984]',
    # key spelled 'Palantine'; mapped to the 'palatine' spelling
    'Palantine tonsil [FMA:9610]': 'palatine tonsil [FMA:9610]',
    'Palatine tonsil [FMA:9610]': 'palatine tonsil [FMA:9610]',
    'palatine_tonsils': 'palatine tonsil [FMA:9610]',
    'perianal_region': 'perianal space [FMA:29719]',
    'Plasma [FMA:62970]': 'plasma [FMA:62970]',
    'popliteal_fossa': 'popliteal fossa [FMA:22525]',
    'posterior_fornix': 'posterior fornix of vagina [FMA:19987]',
    'Posterior fornix of vagina [FMA:19987]': 'posterior fornix of vagina [FMA:19987]',
    'rectal': 'rectum [FMA:14544]',
    'right_antecubital_fossa': 'right cubital fossa [FMA:39849]',
    'right cubital fossa [FMA:39849]': 'right cubital fossa [FMA:39849]',
    'right_retroauricular_crease': 'skin of right auriculotemporal part of head [FMA:70331]',
    'saliva': 'portion of saliva [FMA:59862]',
    'scalp': 'scalp [FMA:46494]',
    'shin': 'anterior part of leg [FMA:24985]',
    'shoulder': 'shoulder [FMA:25202]',
    'Skin of left auriculotemporal part of head [FMA:70332]': 'skin of left auriculotemporal part of head [FMA:70332]',
    'Skin of right auriculotemporal part of head [FMA:70331]': 'skin of right auriculotemporal part of head [FMA:70331]',
    'stool': 'feces [FMA:64183]',
    # both plaque sites are mapped to the gingiva term
    'subgingival_plaque': 'gingiva [FMA:59762]',
    'supragingival_plaque': 'gingiva [FMA:59762]',
    'test': 'test',
    'thigh': 'thigh [FMA:24967]',
    'throat': 'throat [FMA:228738]',
    'Throat [FMA:228738]': 'throat [FMA:228738]',
    'tongue_dorsum': 'dorsum of tongue [FMA:54651]',
    'unknown': 'unknown',
    'urethra': 'urethra [FMA:19667]',
    'urinary_tract': 'urinary tract [FMA:326482]',
    'Vagina [FMA:19949]': 'vagina [FMA:19949]',
    'vaginal': 'vagina [FMA:19949]',
    'vaginal_introitus': 'orifice of vagina [FMA:19984]',
    'volar_forearm': 'forearm [FMA:9663]',
    'wall_of_vagina': 'wall of vagina [FMA:19971]',
}

# A dict that purges the FMA code from the data: every key is one accepted
# spelling of a body site (snake_case name, label carrying an FMA id, or bare
# FMA id) and every value is a plain lowercase name without any '[FMA:...]'
# suffix.
# This table also contains keys that body_site_dict does not have
# (e.g. 'ascending_colon', 'cerebrospinal_fluid', 'terminal_ileum').
# NOTE(review): the retroauricular crease entries map to
# 'left/right retroauricular crease' here, while body_site_dict maps the same
# keys to 'skin of left/right auriculotemporal part of head [FMA:...]' --
# confirm the difference is intended.
fma_free_body_site_dict = {
    'abdomen': 'abdomen',
    'antecubital_fossa': 'cubital fossa',
    'anterior_nares': 'external naris',
    'ascending_colon': 'ascending colon',
    'attached_keratinized_gingiva': 'gingiva',
    'back': 'back',
    'blood': 'blood cell',
    'buccal_mucosa': 'buccal mucosa',
    'Buccal mucosa [FMA:59785]': 'buccal mucosa',
    'cerebrospinal_fluid': 'cerebrospinal fluid',
    'cervix': 'cervix of uterus',
    'descending_colon': 'descending colon',
    'Dorsum of tongue [FMA:54651]': 'dorsum of tongue',
    'elbow': 'elbow',
    'External naris [FMA:59645]': 'external naris',
    'FMA:276108': 'right nasal cavity',
    'FMA:326482': 'urinary tract',
    'FMA:64183': 'feces',
    'FMA:86713': 'peripheral blood mononuclear cell',
    'FMA:7842': 'angle of seventh rib',
    'foot': 'foot',
    'forearm': 'forearm',
    'gall_bladder': 'gall bladder',
    'gastric_antrum': 'gastric antrum',
    'Gastrointestinal tract [FMA:71132]': 'gastrointestinal tract',
    'gingiva [FMA:59762]': 'gingiva',
    'Gingiva [FMA:59762]': 'gingiva',
    'gingival_crevices': 'gingiva',
    'gut': 'gastrointestinal tract',
    'hand': 'hand',
    'hard_palate': 'hard palate',
    'Hard palate [FMA:55023]': 'hard palate',
    'head': 'head',
    'ileal_pouch': 'ileum',
    'ileal-anal_pouch': 'ileal-anal pouch',
    'ileum': 'ileum',
    'knee': 'knee',
    'left_antecubital_fossa': 'left cubital fossa',
    'left_arm': 'left arm',
    'left_retroauricular_crease': 'left retroauricular crease',
    'leg': 'leg',
    'lung_aspirate': 'lung aspirate',
    'lymph_node': 'lymph node',
    'mid_vagina': 'vagina',
    'nare': 'external naris',
    'nasal': 'nasal cavity',
    'nasopharynx': 'nasopharynx',
    'Nasopharynx [FMA:54878]': 'nasopharynx',
    'oral_cavity': 'oral cavity',
    'Oral cavity [FMA:20292]': 'oral cavity',
    'Orifice of vagina [FMA:19984]': 'orifice of vagina',
    # key spelled 'Palantine'; mapped to the 'palatine' spelling
    'Palantine tonsil [FMA:9610]': 'palatine tonsil',
    'Palatine tonsil [FMA:9610]': 'palatine tonsil',
    'palatine_tonsils': 'palatine tonsil',
    'perianal_region': 'perianal space',
    'Plasma [FMA:62970]': 'plasma',
    'popliteal_fossa': 'popliteal fossa',
    'posterior_fornix': 'posterior fornix of vagina',
    'Posterior fornix of vagina [FMA:19987]': 'posterior fornix of vagina',
    'rectal': 'rectum',
    'respiratory_tract': 'respiratory tract',
    'right_antecubital_fossa': 'right cubital fossa',
    'right cubital fossa [FMA:39849]': 'right cubital fossa',
    'right_retroauricular_crease': 'right retroauricular crease',
    'saliva': 'portion of saliva',
    'scalp': 'scalp',
    'shin': 'anterior part of leg',
    'shoulder': 'shoulder',
    'sigmoid_colon': 'sigmoid colon',
    'Skin of left auriculotemporal part of head [FMA:70332]': 'left retroauricular crease',
    'Skin of right auriculotemporal part of head [FMA:70331]': 'right retroauricular crease',
    'spinal_cord': 'spinal cord',
    'stool': 'feces',
    'subgingival_plaque': 'gingiva',
    'supragingival_plaque': 'gingiva',
    'synovial_fluid': 'synovial fluid',
    'terminal_ileum': 'terminal ileum',
    'thigh': 'thigh',
    'throat': 'throat',
    'Throat [FMA:228738]': 'throat',
    'tongue_dorsum': 'dorsum of tongue',
    'transverse_colon': 'transverse colon',
    'unknown': 'unknown',
    'upper_respiratory_tract': 'upper respiratory tract',
    'urethra': 'urethra',
    'urinary_tract': 'urinary tract',
    'Vagina [FMA:19949]': 'vagina',
    'vaginal': 'vagina',
    'vaginal_introitus': 'orifice of vagina',
    'volar_forearm': 'forearm',
    'wall_of_vagina': 'wall of vagina',
}

# Explicit allow-list of the metadata that makes it through the _attr nodes,
# so that nothing redundant with information like study_name is carried
# along. The names are kept in one whitespace-separated block (one line group
# per leading character) and split into a set.
meta_to_keep = set("""
    30m_gluc 60m_gluc
    abdominal_pain abx activity_30d activity_3m activity_change_30d
    activity_change_3m acute_dis aerobics age alcohol allergies anger
    arthralgia asthma
    beans biscuit bmi bowel_day bowel_night bread bread_spread breadrolls
    breakfast_amt breakfast_food breakfast_tod
    cad cancer cancer_mtc cereal cereal_type cheese chemo chest_pain chf
    chips_crisps chronic_dis claudication colonoscopy comment confident
    contact control coping current
    dairy diabetes diag_other diarrhea diet_drinks difficulties dinner_amt
    dinner_food dinner_tod duration dyspnea
    education eggs ery_nodosum
    family_history fast_gluc father fecalcal fever fish fish_count fish_oil
    fish_white fruit fruit_count
    gallbladder going_your_way grains
    hbi hbi_total height hosp hyperlipidemia hypertension
    ice_cream illicit_drug immunosupp irritation
    juice
    kidney
    leg_edema liver lunch_amt lunch_food lunch_tod
    meat meat_product meat_red meat_white milk mod_activity_days
    mod_activity_hours mod_activity_minutes mother
    neurologic new_meds
    occupation on_top oral_contrast osa other_food_intake
    pancreatitis pastry poultry preg_plans pregnant prior probiotic
    psychiatric pvd pyo_gangrenosum
    rash rx
    salt sccai sccai_total self_assess self_condition shellfish siblings soda
    starch starch_type stool_blood stool_soft stopped_meds stress stress_def
    study_disease_description study_disease_disease_ontology_id
    study_disease_mesh_id study_disease_name study_disease_nci_id
    study_disease_status study_disease_umls_concept_id
    subtype sugar sugar_drinks surgery survey_id sweets sweets_count
    upset urgency_def uveitis
    veg veg_green veg_raw veg_root vig_activity_days vig_activity_hours
    vig_activity_minutes
    walking_days walking_hours walking_minutes water weight weight_change
    weight_diff work_missed
    yogurt
""".split())

# Metadata values that add no real information and are therefore all
# standardized to NA. They are not actually stored in the database; when the
# metadata for a cart is pulled, these missing values default to NA.
meta_null_vals = {
    'nan', 'none', 'n/a', 'not applicable',
    'unknown/undiagnosed', 'dad', 'mom', 'unknown/not reported',
    # the empty string counts as a null value too
    '',
}

# Keys that should persist one step down, i.e. stay attached to the key below
# them (e.g. dinner_tod is needed, not just a tod shared by all of
# breakfast/lunch/dinner).
keys_to_keep = {
    # activity
    'walking', 'mod_activity', 'vig_activity',
    # disease
    'study_disease', 'study_disease_status',
    # meals
    'snacks', 'breakfast', 'lunch', 'dinner',
}

# set of IDs from OSDF that were test nodes and are pending removal from OSDF
ignore = {
    '88af6472fb03642dd5eaf8cddc37b0f3',
    '88af6472fb03642dd5eaf8cddc2f50b1',
    '88af6472fb03642dd5eaf8cddc2f07c1',
    '88af6472fb03642dd5eaf8cddc712ed7',
    '932d8fbc70ae8f856028b3f67cfab1ed',
    'b9af32d3ab623bcfbdce2ea3a502c015',
    '610a4911a5ca67de12cdc1e4b4014cd0',
    '610a4911a5ca67de12cdc1e4b40135fe',
    '610a4911a5ca67de12cdc1e4b4014133',
    '610a4911a5ca67de12cdc1e4b40156e8',
    '610a4911a5ca67de12cdc1e4b40164de',
    '610a4911a5ca67de12cdc1e4b4017467',
    '610a4911a5ca67de12cdc1e4b4017ab9',
    '9bb18fe313e7fe94bf243da07e000de0',
    '9bb18fe313e7fe94bf243da07e00107e',
    'b9af32d3ab623bcfbdce2ea3a5016b61',
    '9bb18fe313e7fe94bf243da07e003ac0',
    '419d64483ec86c1fb9a94025f3b94551',
    '88af6472fb03642dd5eaf8cddc70c8ec',
    '88af6472fb03642dd5eaf8cddc70d1de',
    '858ed4564f11795ec13dda4c109b345f',
    '67ff3a7b9227c8c6f1db4bbf2226fc4b',
    '67ff3a7b9227c8c6f1db4bbf2227079e',
    '88af6472fb03642dd5eaf8cddc2f4cb4',
    '88af6472fb03642dd5eaf8cddc2f4340',
    '194149ed5273e3f94fc60a9ba5001573',
    '194149ed5273e3f94fc60a9ba59d2c9f',
    '88af6472fb03642dd5eaf8cddc2f5abe',
    '9bb18fe313e7fe94bf243da07e0032e4',
    '88af6472fb03642dd5eaf8cddc2f3405',
    '194149ed5273e3f94fc60a9ba50069b0',
    '88af6472fb03642dd5eaf8cddc714325',
    '5a950f27980b5d93e4c16da1243b7c05',
    '5a950f27980b5d93e4c16da1243b821c',
    '52d8c92f2d3660b9add954d544a02d90'
}

# Nested lookup table of descriptive properties ('data_modality', 'data_type',
# 'organism_type' and, for some entries, 'abundance_type').
# At each level the special '_key' entry names the attribute that level is
# keyed on: the top level is keyed on node_type, and some node types hold a
# sub-mapping keyed on another attribute (matrix_type, study, subtype, abbrev).
# Dicts without a '_key' entry hold the properties themselves.
# An 'organism_type' of {'_key': 'parent'} marks a value that depends on the
# parent assay node instead of being fixed here.
# NOTE(review): how '_key' is resolved is up to the code that consumes this
# table, which is not visible here -- confirm against the caller.
node_type_mapping = {
    # top-level mapping on node_type
    '_key': 'node_type',
    'wgs_raw_seq_set': {'data_modality': 'whole metagenome', 'data_type': 'sequence', 'organism_type': 'bacterial'},
    'host_wgs_raw_seq_set': {'data_modality': 'whole genome', 'data_type': 'sequence', 'organism_type': 'host'},
    'microb_transcriptomics_raw_seq_set': {'data_modality': 'metatranscriptome', 'data_type': 'sequence', 'organism_type': 'bacterial'},
    'host_transcriptomics_raw_seq_set': {'data_modality': 'transcriptome', 'data_type': 'sequence', 'organism_type': 'host'},
    # TODO - change data_modality to "epigenetics" or "epigenomics"?
    'host_epigenetics_raw_seq_set': {'data_modality': 'whole genome', 'data_type': 'sequence', 'organism_type': 'host'},
    # TODO - change data_modality to "variation" or "genomic variation"?
    'host_variant_call': {'data_modality': 'whole genome', 'data_type': 'sequence', 'organism_type': 'host'},
    '16s_raw_seq_set': {'data_modality': 'marker sequence', 'data_type': 'sequence', 'organism_type': 'bacterial'},
    '16s_trimmed_seq_set': {'data_modality': 'marker sequence', 'data_type': 'sequence', 'organism_type': 'bacterial'},
    'proteome': {'data_modality': 'proteome', 'data_type': 'abundance', 'organism_type': 'host'},
    'metaproteome': {'data_modality': 'metaproteome', 'data_type': 'abundance', 'organism_type': 'multi-organism'},
    'metabolome': {'data_modality': 'metabolome', 'data_type': 'abundance', 'organism_type': 'host'},

    # organism_type = dependent upon parent assay node (either 'host' or 'bacterial')
    'lipidome': {'data_modality': 'lipidome', 'data_type': 'abundance', 'organism_type': { '_key': 'parent' }, 'abundance_type': 'lipidome'},
    'cytokine': {'data_modality': 'cytokine', 'data_type': 'abundance', 'organism_type': { '_key': 'parent' }, 'abundance_type': 'transcriptome'},
    'serology': {'data_modality': 'serology', 'data_type': 'abundance', 'organism_type': { '_key': 'parent' }, 'abundance_type': 'serology'},

    # TODO: metametabolome: metabolome - abundance - multi-organism

    'abundance_matrix': {
        # sub-mapping on matrix_type
        '_key': 'matrix_type',
        'wgs_functional': {'data_modality': 'whole metagenome', 'data_type': 'abundance', 'organism_type': 'bacterial', 'abundance_type': 'functional'},
        'wgs_community': {'data_modality': 'whole metagenome', 'data_type': 'abundance', 'organism_type': 'bacterial', 'abundance_type': 'community'},
        '16s_community': {'data_modality': 'marker sequence', 'data_type': 'abundance', 'organism_type': 'bacterial', 'abundance_type': 'community'},
        'microb_metatranscriptome': {'data_modality': 'metatranscriptome', 'data_type': 'abundance', 'organism_type': 'bacterial', 'abundance_type': 'transcriptome'},
        'microb_metabolome': {'data_modality': 'metabolome', 'data_type': 'abundance', 'organism_type': 'bacterial', 'abundance_type': 'metabolome'},
        'microb_proteomic': {'data_modality': 'proteome', 'data_type': 'abundance', 'organism_type': 'bacterial', 'abundance_type': 'proteome'},
        'host_transcriptome': {'data_modality': 'transcriptome', 'data_type': 'abundance', 'organism_type': 'host', 'abundance_type': 'transcriptome'},
        'host_cytokine': {'data_modality': 'cytokine', 'data_type': 'abundance', 'organism_type': 'host', 'abundance_type': 'transcriptome'},
        'host_lipidomic': {'data_modality': 'lipidome', 'data_type': 'abundance', 'organism_type': 'host', 'abundance_type': 'lipidome'}
        },

    'alignment': {
        # sub-mapping on study - don't want to assume the same mapping for non-HMP data
        '_key': 'study',
        'Human microbiome project WGS production phase I.': {'data_modality': 'whole metagenome', 'data_type': 'alignment', 'organism_type': 'bacterial'}
        },

    'annotation': {
        # sub-mapping on study; only one study is listed, and it carries a further sub-mapping
        '_key': 'study',
        'Human microbiome project WGS production phase I.':
            {
            # sub-mapping on subtype
            '_key': 'subtype',
            # multi-FASTA protein files
            'hmgi': {'data_modality': 'metaproteome', 'data_type': 'sequence', 'organism_type': 'bacterial'},
            'hmhgi': {'data_modality': 'metaproteome', 'data_type': 'sequence', 'organism_type': 'bacterial'},
            'wgs_annotation': {'data_modality': 'metaproteome', 'data_type': 'sequence', 'organism_type': 'bacterial'},
            # GFF files
            'hmgi2': {'data_modality': 'metatranscriptome', 'data_type': 'sequence', 'organism_type': 'bacterial'},
            'hmcgi2': {'data_modality': 'metatranscriptome', 'data_type': 'sequence', 'organism_type': 'bacterial'}
            }

        },

    'clustered_seq_set': {
        # sub-mapping on abbrev
        '_key': 'abbrev',
        'HMGC': {'data_modality': 'metaproteome', 'data_type': 'sequence', 'organism_type': 'bacterial'},
        'HMGC2': {'data_modality': 'metaproteome', 'data_type': 'sequence', 'organism_type': 'bacterial'}
    },
    'viral_seq_set': {'data_modality': 'whole metagenome', 'data_type': 'sequence', 'organism_type': 'viral'},
    'proteome_nonpride': {'data_modality': 'proteome', 'data_type': 'abundance', 'organism_type': 'host'},
    'wgs_assembled_seq_set': {'data_modality': 'whole metagenome', 'data_type': 'sequence', 'organism_type': 'bacterial'}

    }