Skip to content

Commit

Permalink
Merge pull request #150 from mvdbeek/dataset_ref
Browse files Browse the repository at this point in the history
Add variant calling workflow, use URL input for GTF and fasta references
  • Loading branch information
dannon authored Oct 30, 2024
2 parents e175617 + bdcc97c commit 3871f97
Show file tree
Hide file tree
Showing 15 changed files with 2,490 additions and 1,510 deletions.
9 changes: 9 additions & 0 deletions app/apis/catalog/brc-analytics-catalog/common/constants.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import { ANALYSIS_METHOD, WORKFLOW_ID } from "./entities";

/**
 * Lookup from analysis method to the workflow ID used to launch it.
 * The map is partial: analysis methods without an entry here have no
 * associated workflow, so a lookup for them yields `undefined`.
 */
export const WORKFLOW_IDS_BY_ANALYSIS_METHOD: Partial<
Record<ANALYSIS_METHOD, WORKFLOW_ID>
> = {
[ANALYSIS_METHOD.REGULATION]: WORKFLOW_ID.REGULATION,
[ANALYSIS_METHOD.TRANSCRIPTOMICS]: WORKFLOW_ID.TRANSCRIPTOMICS,
[ANALYSIS_METHOD.VARIANT_CALLING]: WORKFLOW_ID.VARIANT_CALLING,
};
16 changes: 16 additions & 0 deletions app/apis/catalog/brc-analytics-catalog/common/entities.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
/**
 * Analysis methods that can be offered for a genome.
 * Each member's value is the same string as its name.
 */
export enum ANALYSIS_METHOD {
ASSEMBLY = "ASSEMBLY",
GENOME_COMPARISONS = "GENOME_COMPARISONS",
PROTEIN_FOLDING = "PROTEIN_FOLDING",
REGULATION = "REGULATION",
TRANSCRIPTOMICS = "TRANSCRIPTOMICS",
VARIANT_CALLING = "VARIANT_CALLING",
}

export type BRCCatalog = BRCDataCatalogGenome;

export interface BRCDataCatalogGenome {
chromosomes: number;
contigs: number;
geneModelUrl: string;
genomeVersionAssemblyId: string;
ncbiTaxonomyId: string;
organism: string;
Expand All @@ -25,3 +35,9 @@ export interface EntitiesResponsePagination {
size: number;
total: number;
}

/**
 * Workflow identifiers, expressed as Dockstore GA4GH TRS v2 URLs that pin a
 * specific workflow version (the trailing `/versions/<tag>` segment).
 * NOTE(review): the tool ID segment contains an unencoded `#`; presumably the
 * consumer parses the URL as an opaque string rather than fetching it - confirm.
 */
export enum WORKFLOW_ID {
REGULATION = "https://dockstore.org/api/ga4gh/trs/v2/tools/#workflow/github.com/iwc-workflows/chipseq-pe/main/versions/v0.12",
TRANSCRIPTOMICS = "https://dockstore.org/api/ga4gh/trs/v2/tools/#workflow/github.com/iwc-workflows/rnaseq-pe/main/versions/v0.9",
VARIANT_CALLING = "https://dockstore.org/api/ga4gh/trs/v2/tools/#workflow/github.com/iwc-workflows/haploid-variant-calling-wgs-pe/main/versions/v0.1",
}
30 changes: 25 additions & 5 deletions app/components/Entity/components/AnalysisMethod/analysisMethod.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,32 @@ import {
ANCHOR_TARGET,
REL_ATTRIBUTE,
} from "@databiosphere/findable-ui/lib/components/Links/common/entities";
import { useAsync } from "@databiosphere/findable-ui/src/hooks/useAsync";
import { Card } from "@mui/material";
import { WORKFLOW_IDS_BY_ANALYSIS_METHOD } from "app/apis/catalog/brc-analytics-catalog/common/constants";
import { getWorkflowLandingUrl } from "app/utils/galaxy-api";
import { ANALYSIS_METHOD } from "../../../../apis/catalog/brc-analytics-catalog/common/entities";
import {
StyledButtonPrimary,
StyledCardContent,
} from "./analysisMethod.styles";

export interface AnalysisMethodProps extends CardProps {
url: string;
analysisMethod: ANALYSIS_METHOD;
geneModelUrl: string;
genomeVersionAssemblyId: string;
}

export const AnalysisMethod = ({
analysisMethod,
geneModelUrl,
genomeVersionAssemblyId,
Paper = FluidPaper,
text,
title,
url,
}: AnalysisMethodProps): JSX.Element => {
const workflowId = WORKFLOW_IDS_BY_ANALYSIS_METHOD[analysisMethod];
const { data: landingUrl, isLoading, run } = useAsync<string>();
return (
<Card component={Paper}>
<CardSection>
Expand All @@ -31,16 +41,26 @@ export const AnalysisMethod = ({
<CardText>{text}</CardText>
</StyledCardContent>
<StyledButtonPrimary
disabled={!url}
onClick={(): void => {
disabled={!workflowId || isLoading}
onClick={async (): Promise<void> => {
if (!workflowId) return;
const url =
landingUrl ??
(await run(
getWorkflowLandingUrl(
workflowId,
genomeVersionAssemblyId,
geneModelUrl
)
));
window.open(
url,
ANCHOR_TARGET.BLANK,
REL_ATTRIBUTE.NO_OPENER_NO_REFERRER
);
}}
>
Analyze
{isLoading ? "Loading..." : "Analyze"}
</StyledButtonPrimary>
</CardSection>
</Card>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import { TEXT_HEADING } from "@databiosphere/findable-ui/lib/theme/common/typography";
import { Typography } from "@mui/material";

/**
 * Props for the AnalysisMethodsTitle component.
 */
interface AnalysisMethodsTitleProps {
// Content rendered inside the heading.
title: React.ReactNode;
}

/**
 * Renders the given title as an `h2` heading using the shared heading
 * typography variant.
 * @param props - Component props.
 * @param props.title - Content rendered inside the heading.
 * @returns heading element.
 */
export const AnalysisMethodsTitle = (
  props: AnalysisMethodsTitleProps
): JSX.Element => (
  <Typography color="ink.main" component="h2" variant={TEXT_HEADING}>
    {props.title}
  </Typography>
);
1 change: 1 addition & 0 deletions app/components/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ export { Link } from "@databiosphere/findable-ui/lib/components/Links/components
export { BasicCell } from "@databiosphere/findable-ui/lib/components/Table/components/TableCell/components/BasicCell/basicCell";
export { CopyText } from "./common/CopyText/copyText";
export { AnalysisMethod } from "./Entity/components/AnalysisMethod/analysisMethod";
export { AnalysisMethodsTitle } from "./Entity/components/AnalysisMethodsTitle/analysisMethodsTitle";
export { AnalysisPortals } from "./Entity/components/AnalysisPortals/analysisPortals";
export { DetailViewHero } from "./Layout/components/Detail/components/DetailViewHero/detailViewHero";
export { GridPaperSection } from "./Layout/components/Detail/components/Section/section.styles";
Expand Down
90 changes: 90 additions & 0 deletions app/utils/galaxy-api.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import ky from "ky";
import { WORKFLOW_ID } from "../apis/catalog/brc-analytics-catalog/common/entities";

/**
 * JSON body sent in the POST request to the workflow landings API.
 */
interface WorkflowLandingsBody {
// Always `true`; the landing page URL built from the response also carries `?public=true`.
public: true;
// Workflow inputs to pre-populate; the shape depends on the workflow.
request_state: WorkflowLandingsBodyRequestState;
// Identifier of the workflow; a TRS URL, as indicated by `workflow_target_type`.
workflow_id: string;
// Only TRS URL targets are used by this module.
workflow_target_type: "trs_url";
}

/**
 * Workflow input values keyed by input name. Each value is either a plain
 * string or a flat object of string properties (e.g. a URL-sourced dataset
 * described by `ext`, `src` and `url`).
 */
type WorkflowLandingsBodyRequestState = {
[key: string]: { [key: string]: string } | string;
};

/**
 * The part of the workflow landings API response that this module reads.
 */
interface WorkflowLanding {
// Identifier of the created landing; appended to the landing page URL prefix.
uuid: string;
}

// Endpoint that workflow landing requests are POSTed to.
// NOTE(review): both URLs below point at test.galaxyproject.org - confirm this is the intended instance.
const WORKFLOW_LANDINGS_API_URL =
"https://test.galaxyproject.org/api/workflow_landings";

// Prefix of the user-facing landing page URL; the landing UUID is appended to it.
const WORKFLOW_LANDING_URL_PREFIX =
"https://test.galaxyproject.org/workflow_landings/";

/**
 * Create a workflow landing for the given workflow and genome, and return the
 * URL of the resulting landing page.
 * @param workflowId - Value for the `workflow_id` parameter sent to the API.
 * @param referenceGenome - Genome version/assembly ID.
 * @param geneModelUrl - URL for gene model parameter sent to the API.
 * @returns workflow landing URL.
 */
export async function getWorkflowLandingUrl(
  workflowId: WORKFLOW_ID,
  referenceGenome: string,
  geneModelUrl: string
): Promise<string> {
  const requestState = getWorkflowLandingsRequestState(
    workflowId,
    referenceGenome,
    geneModelUrl
  );
  const payload: WorkflowLandingsBody = {
    public: true,
    request_state: requestState,
    workflow_id: workflowId,
    workflow_target_type: "trs_url",
  };
  // The retry option explicitly opts POST requests into ky's retry handling.
  const response = await ky.post<WorkflowLanding>(WORKFLOW_LANDINGS_API_URL, {
    json: payload,
    retry: {
      methods: ["post"],
    },
  });
  const { uuid } = await response.json();
  const landingPath = encodeURIComponent(uuid);
  return `${WORKFLOW_LANDING_URL_PREFIX}${landingPath}?public=true`;
}

/**
 * Build the UCSC hubs download URL of the gzipped FASTA file for an assembly.
 * The path is `<prefix>/<ddd>/<ddd>/<ddd>/<identifier>/<identifier>.fa.gz`,
 * where `<prefix>` is the text before the first underscore and the three
 * `<ddd>` segments are the first nine characters after it, in groups of three.
 * For example `GCF_030504385.1` maps to
 * `.../hubs/GCF/030/504/385/GCF_030504385.1/GCF_030504385.1.fa.gz`.
 * @param identifier - Assembly identifier of the form `<prefix>_<accession>`.
 * @returns FASTA download URL.
 * @throws Error if the identifier does not contain an underscore.
 */
function buildFastaUrl(identifier: string): string {
  const baseUrl = "https://hgdownload.soe.ucsc.edu/hubs/";
  const [prefix, accession] = identifier.split("_");
  // Without an underscore there is no accession part to derive the path from;
  // fail with a clear message instead of an opaque TypeError on `undefined`.
  if (accession === undefined) {
    throw new Error(
      `Invalid assembly identifier "${identifier}": expected the form "<prefix>_<accession>", e.g. "GCF_030504385.1"`
    );
  }
  const accessionSegments = [0, 3, 6].map((start) =>
    accession.slice(start, start + 3)
  );
  const formattedPath = [
    prefix,
    ...accessionSegments,
    identifier,
    `${identifier}.fa.gz`,
  ].join("/");
  return `${baseUrl}${formattedPath}`;
}

/**
 * Build the `request_state` object for a workflow landings request.
 * The variant calling workflow, when a gene model URL is available, receives
 * its annotation and genome FASTA as URL-sourced inputs; every other case
 * receives only the reference genome identifier.
 * @param workflowId - Workflow ID.
 * @param referenceGenome - Reference genome.
 * @param geneModelUrl - URL for gene model parameter.
 * @returns `request_state` value for the workflow landings request body.
 */
function getWorkflowLandingsRequestState(
  workflowId: WORKFLOW_ID,
  referenceGenome: string,
  geneModelUrl: string
): WorkflowLandingsBodyRequestState {
  const usesUrlInputs =
    workflowId === WORKFLOW_ID.VARIANT_CALLING && Boolean(geneModelUrl);
  if (!usesUrlInputs) {
    return { reference_genome: referenceGenome };
  }
  const annotationInput = { ext: "gtf.gz", src: "url", url: geneModelUrl };
  const fastaInput = {
    ext: "fasta.gz",
    src: "url",
    url: buildFastaUrl(referenceGenome),
  };
  return {
    "Annotation GTF": annotationInput,
    "Genome fasta": fastaInput,
  };
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@ import {
import { ViewContext } from "@databiosphere/findable-ui/lib/config/entities";
import { ComponentProps } from "react";
import { ROUTES } from "../../../../../routes/constants";
import { BRCDataCatalogGenome } from "../../../../apis/catalog/brc-analytics-catalog/common/entities";
import {
ANALYSIS_METHOD,
BRCDataCatalogGenome,
} from "../../../../apis/catalog/brc-analytics-catalog/common/entities";
import * as C from "../../../../components";
import { GENOME_BROWSER } from "./constants";

Expand Down Expand Up @@ -62,17 +65,25 @@ export const buildContigs = (
* @param cardProps - Card properties.
* @param cardProps.text - Card text.
* @param cardProps.title - Card title.
* @param cardProps.url - Card url.
* @param cardProps.analysisMethod - Analysis method.
* @returns Props to be used for the AnalysisMethod component.
*/
export const buildGenomeAnalysisMethod = (
genome: BRCDataCatalogGenome,
{ text, title, url }: Partial<CardProps> & { url: string }
{
analysisMethod,
text,
title,
}: Partial<CardProps> & {
analysisMethod: ANALYSIS_METHOD;
}
): ComponentProps<typeof C.AnalysisMethod> => {
return {
analysisMethod,
geneModelUrl: genome.geneModelUrl,
genomeVersionAssemblyId: genome.genomeVersionAssemblyId,
text,
title,
url,
};
};

Expand Down
1 change: 1 addition & 0 deletions files/build-catalog.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ async function buildGenomes(): Promise<BRCDataCatalogGenome[]> {
(row): BRCDataCatalogGenome => ({
chromosomes: parseNumber(row.Chromosomes),
contigs: parseNumber(row.Contigs),
geneModelUrl: row.geneModelUrl,
genomeVersionAssemblyId: row["Genome Version/Assembly ID"],
ncbiTaxonomyId: row.taxId,
organism: row.Organism,
Expand Down
41 changes: 39 additions & 2 deletions files/build-genomes-files.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd
import re
import requests
import urllib.parse

GENOMES_SOURCE_URL = "https://docs.google.com/spreadsheets/d/1NRfTvebPl6zJ0l9tCqBtq6YCrwV6_XDBlheq3L5HcvQ/gviz/tq?tqx=out:csv&sheet=GenomeDataTypes_Summary.csv"
ASSEMBLIES_URL = "https://hgdownload.soe.ucsc.edu/hubs/BRC/assemblyList.json"
Expand All @@ -14,14 +15,48 @@ def get_duplicate_ids(genomes_df):
def get_unmatched_assemblies(assemblies_df, result_df):
    """Return the set of `asmId` values in `assemblies_df` that are absent from `result_df`."""
    known_ids = set(assemblies_df["asmId"])
    matched_ids = set(result_df["asmId"])
    return known_ids.difference(matched_ids)

def _id_to_gene_model_url(asm_id):
    """Find the URL of a gzipped GTF gene model for the given assembly ID.

    Fetches the assembly's `genes` directory listing from the UCSC hubs
    download server and scans the HTML for file names that start with the
    assembly ID and end in `.gtf.gz`.

    Args:
        asm_id: Assembly ID such as `GCF_030504385.1`; characters 0-3 and
            4-13 are used to build the directory path.

    Returns:
        The URL of the first `ncbiRefSeq` GTF found; otherwise the URL of the
        last `augustus` GTF found; otherwise `None` (also returned when the
        directory request fails with an HTTP error status).
    """
    hubs_url = "https://hgdownload.soe.ucsc.edu/hubs/"
    components = [asm_id[0:3], asm_id[4:7], asm_id[7:10], asm_id[10:13], asm_id, "genes"]
    url = urllib.parse.urljoin(hubs_url, "/".join(components))
    # url looks something like https://hgdownload.soe.ucsc.edu/hubs/GCF/030/504/385/GCF_030504385.1/genes/
    # and contains html content with links to gene models.
    # we need to scrape this to get the gtf
    print(f"fetching url {url}")
    response = requests.get(url)
    try:
        response.raise_for_status()
    except requests.HTTPError:
        # FIXME?: Some accessions don't have a gene folder
        return None
    # find link to gtf, should ideally be ncbiRefSeq, but augustus will do
    html_content = response.text
    # re.escape makes the "." in the assembly ID literal. Building the pattern
    # by concatenation avoids a backslash inside an f-string replacement field,
    # which is a SyntaxError on Python versions before 3.12.
    pattern = re.escape(asm_id) + r".*?\.gtf\.gz"
    augustus_file = None
    for match in re.findall(pattern, html_content):
        if "ncbiRefSeq" in match:
            return urllib.parse.urljoin(f"{url}/", match)
        elif "augustus" in match:
            augustus_file = match
    if augustus_file:
        return urllib.parse.urljoin(f"{url}/", augustus_file)
    # No gtf link found in the listing; callers receive None.
    return None


def add_gene_model_url(result_df: pd.DataFrame):
    """Add a `geneModelUrl` column to `result_df` in place.

    Each value is the result of `_id_to_gene_model_url` applied to the row's
    `Genome Version/Assembly ID`.
    """
    assembly_ids = result_df["Genome Version/Assembly ID"]
    gene_model_urls = assembly_ids.apply(_id_to_gene_model_url)
    result_df["geneModelUrl"] = gene_model_urls


def build_genomes_files():
print("Building files")

genomes_source_df = pd.read_csv(GENOMES_SOURCE_URL, keep_default_na=False, usecols=lambda name: re.fullmatch(r"Unnamed: \d+", name) is None)
assemblies_df = pd.DataFrame(requests.get(ASSEMBLIES_URL).json()["data"])

duplicate_ids = get_duplicate_ids(genomes_source_df)
print(f"Removing rows with duplicate Genome Version/Assembly ID values of: {", ".join(duplicate_ids)}")
print(f"Removing rows with duplicate Genome Version/Assembly ID values of: {', '.join(duplicate_ids)}")

deduped_genomes_df = genomes_source_df.drop_duplicates(subset=["Genome Version/Assembly ID"])

Expand All @@ -30,9 +65,11 @@ def build_genomes_files():

result_df = gen_bank_merge_df.combine_first(ref_seq_merge_df).dropna(subset=["ucscBrowser"])

add_gene_model_url(result_df)

unmatched_assemblies = get_unmatched_assemblies(assemblies_df, result_df)
if (len(unmatched_assemblies) != 0):
print(f"Omitted {len(unmatched_assemblies)} assemblies that had no matches: {", ".join(unmatched_assemblies)}")
print(f"Omitted {len(unmatched_assemblies)} assemblies that had no matches: {', '.join(unmatched_assemblies)}")

result_df["taxId"] = result_df["taxId"].astype(int)

Expand Down
1 change: 1 addition & 0 deletions files/entities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ export interface SourceGenome {
comName: string;
Contigs: string;
genBank: string;
geneModelUrl: string;
"Genome Source": string;
"Genome Version/Assembly ID": string;
identical: string;
Expand Down
Loading

0 comments on commit 3871f97

Please sign in to comment.