Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add variant calling workflow, use URL input for GTF and fasta references #150

Merged
merged 15 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions app/apis/catalog/brc-analytics-catalog/common/constants.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import { ANALYSIS_METHOD, WORKFLOW_ID } from "./entities";

/**
 * Lookup from analysis method to the workflow ID used to run it.
 * The map is partial: only REGULATION, TRANSCRIPTOMICS and VARIANT_CALLING
 * have an entry, so looking up any other analysis method yields `undefined`.
 */
export const WORKFLOW_IDS_BY_ANALYSIS_METHOD: Partial<
Record<ANALYSIS_METHOD, WORKFLOW_ID>
> = {
[ANALYSIS_METHOD.REGULATION]: WORKFLOW_ID.REGULATION,
[ANALYSIS_METHOD.TRANSCRIPTOMICS]: WORKFLOW_ID.TRANSCRIPTOMICS,
[ANALYSIS_METHOD.VARIANT_CALLING]: WORKFLOW_ID.VARIANT_CALLING,
};
16 changes: 16 additions & 0 deletions app/apis/catalog/brc-analytics-catalog/common/entities.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
/**
 * Identifiers for the analysis methods known to the catalog.
 * Each member is a string enum whose value equals its own name.
 */
export enum ANALYSIS_METHOD {
ASSEMBLY = "ASSEMBLY",
GENOME_COMPARISONS = "GENOME_COMPARISONS",
PROTEIN_FOLDING = "PROTEIN_FOLDING",
REGULATION = "REGULATION",
TRANSCRIPTOMICS = "TRANSCRIPTOMICS",
VARIANT_CALLING = "VARIANT_CALLING",
}

export type BRCCatalog = BRCDataCatalogGenome;

export interface BRCDataCatalogGenome {
chromosomes: number;
contigs: number;
geneModelUrl: string;
genomeVersionAssemblyId: string;
ncbiTaxonomyId: string;
organism: string;
Expand All @@ -25,3 +35,9 @@ export interface EntitiesResponsePagination {
size: number;
total: number;
}

/**
 * Workflow identifiers, expressed as Dockstore GA4GH TRS v2 tool URLs.
 * Each URL names a workflow under github.com/iwc-workflows and ends with an
 * explicit version segment (e.g. `versions/v0.12`).
 */
export enum WORKFLOW_ID {
REGULATION = "https://dockstore.org/api/ga4gh/trs/v2/tools/#workflow/github.com/iwc-workflows/chipseq-pe/main/versions/v0.12",
TRANSCRIPTOMICS = "https://dockstore.org/api/ga4gh/trs/v2/tools/#workflow/github.com/iwc-workflows/rnaseq-pe/main/versions/v0.9",
VARIANT_CALLING = "https://dockstore.org/api/ga4gh/trs/v2/tools/#workflow/github.com/iwc-workflows/haploid-variant-calling-wgs-pe/main/versions/v0.1",
}
30 changes: 25 additions & 5 deletions app/components/Entity/components/AnalysisMethod/analysisMethod.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,32 @@ import {
ANCHOR_TARGET,
REL_ATTRIBUTE,
} from "@databiosphere/findable-ui/lib/components/Links/common/entities";
import { useAsync } from "@databiosphere/findable-ui/src/hooks/useAsync";
import { Card } from "@mui/material";
import { WORKFLOW_IDS_BY_ANALYSIS_METHOD } from "app/apis/catalog/brc-analytics-catalog/common/constants";
import { getWorkflowLandingUrl } from "app/utils/galaxy-api";
import { ANALYSIS_METHOD } from "../../../../apis/catalog/brc-analytics-catalog/common/entities";
import {
StyledButtonPrimary,
StyledCardContent,
} from "./analysisMethod.styles";

export interface AnalysisMethodProps extends CardProps {
url: string;
analysisMethod: ANALYSIS_METHOD;
geneModelUrl: string;
genomeVersionAssemblyId: string;
}

export const AnalysisMethod = ({
analysisMethod,
geneModelUrl,
genomeVersionAssemblyId,
Paper = FluidPaper,
text,
title,
url,
}: AnalysisMethodProps): JSX.Element => {
const workflowId = WORKFLOW_IDS_BY_ANALYSIS_METHOD[analysisMethod];
const { data: landingUrl, isLoading, run } = useAsync<string>();
return (
<Card component={Paper}>
<CardSection>
Expand All @@ -31,16 +41,26 @@ export const AnalysisMethod = ({
<CardText>{text}</CardText>
</StyledCardContent>
<StyledButtonPrimary
disabled={!url}
onClick={(): void => {
disabled={!workflowId || isLoading}
onClick={async (): Promise<void> => {
if (!workflowId) return;
const url =
landingUrl ??
(await run(
getWorkflowLandingUrl(
workflowId,
genomeVersionAssemblyId,
geneModelUrl
)
));
window.open(
url,
ANCHOR_TARGET.BLANK,
REL_ATTRIBUTE.NO_OPENER_NO_REFERRER
);
}}
>
Analyze
{isLoading ? "Loading..." : "Analyze"}
</StyledButtonPrimary>
</CardSection>
</Card>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import { TEXT_HEADING } from "@databiosphere/findable-ui/lib/theme/common/typography";
import { Typography } from "@mui/material";

/**
 * Props accepted by the AnalysisMethodsTitle component.
 */
interface AnalysisMethodsTitleProps {
  /** Content rendered inside the heading. */
  title: React.ReactNode;
}

/**
 * Renders the given title as an `h2` element using the TEXT_HEADING
 * typography variant and the "ink.main" color.
 * @param props - Component props.
 * @returns heading element wrapping the title.
 */
export const AnalysisMethodsTitle = (
  props: AnalysisMethodsTitleProps
): JSX.Element => (
  <Typography color="ink.main" component="h2" variant={TEXT_HEADING}>
    {props.title}
  </Typography>
);
1 change: 1 addition & 0 deletions app/components/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ export { Link } from "@databiosphere/findable-ui/lib/components/Links/components
export { BasicCell } from "@databiosphere/findable-ui/lib/components/Table/components/TableCell/components/BasicCell/basicCell";
export { CopyText } from "./common/CopyText/copyText";
export { AnalysisMethod } from "./Entity/components/AnalysisMethod/analysisMethod";
export { AnalysisMethodsTitle } from "./Entity/components/AnalysisMethodsTitle/analysisMethodsTitle";
export { AnalysisPortals } from "./Entity/components/AnalysisPortals/analysisPortals";
export { DetailViewHero } from "./Layout/components/Detail/components/DetailViewHero/detailViewHero";
export { GridPaperSection } from "./Layout/components/Detail/components/Section/section.styles";
Expand Down
90 changes: 90 additions & 0 deletions app/utils/galaxy-api.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import ky from "ky";
import { WORKFLOW_ID } from "../apis/catalog/brc-analytics-catalog/common/entities";

/**
 * Shape of the JSON body sent in the POST request to the workflow landings API.
 */
interface WorkflowLandingsBody {
// Always the literal `true`.
public: true;
// Workflow input values; see WorkflowLandingsBodyRequestState.
request_state: WorkflowLandingsBodyRequestState;
// Identifier of the workflow; interpreted according to `workflow_target_type`.
workflow_id: string;
// Always the literal "trs_url".
workflow_target_type: "trs_url";
}

/**
 * Value of the `request_state` field of the workflow landings request body:
 * an object keyed by string where each value is either a plain string or a
 * flat object of string values.
 */
type WorkflowLandingsBodyRequestState = {
[key: string]: { [key: string]: string } | string;
};

/**
 * The part of the workflow landings API response that this module reads.
 */
interface WorkflowLanding {
// Identifier of the created landing; appended to the landing page URL prefix.
uuid: string;
}

// Endpoint that workflow landing requests are POSTed to.
// NOTE(review): both URLs below point at the test.galaxyproject.org host — confirm this is the intended target.
const WORKFLOW_LANDINGS_API_URL =
"https://test.galaxyproject.org/api/workflow_landings";

// Prefix of the landing page URL; the URI-encoded landing UUID is appended to it.
const WORKFLOW_LANDING_URL_PREFIX =
"https://test.galaxyproject.org/workflow_landings/";

/**
 * Create a workflow landing for the given genome workflow and return the URL of its landing page.
 * A POST request is sent to the workflow landings API and the `uuid` from the
 * JSON response is used to build the landing page URL.
 * @param workflowId - Value for the `workflow_id` parameter sent to the API.
 * @param referenceGenome - Genome version/assembly ID.
 * @param geneModelUrl - URL for gene model parameter sent to the API.
 * @returns workflow landing URL.
 */
export async function getWorkflowLandingUrl(
  workflowId: WORKFLOW_ID,
  referenceGenome: string,
  geneModelUrl: string
): Promise<string> {
  const requestState = getWorkflowLandingsRequestState(
    workflowId,
    referenceGenome,
    geneModelUrl
  );
  const payload: WorkflowLandingsBody = {
    public: true,
    request_state: requestState,
    workflow_id: workflowId,
    workflow_target_type: "trs_url",
  };
  // Explicitly allow retries for POST requests.
  const response = await ky.post<WorkflowLanding>(WORKFLOW_LANDINGS_API_URL, {
    json: payload,
    retry: {
      methods: ["post"],
    },
  });
  const { uuid } = await response.json();
  return WORKFLOW_LANDING_URL_PREFIX + encodeURIComponent(uuid) + "?public=true";
}

/**
 * Build the URL of the gzipped FASTA file for an assembly on the UCSC hubs download site.
 * The identifier is split on `_`: the first part and the first nine characters
 * of the second part (as three groups of three) form the directory path, e.g.
 * `GCF_030504385.1` maps to `GCF/030/504/385/GCF_030504385.1/GCF_030504385.1.fa.gz`.
 * @param identifier - Assembly identifier of the form `<prefix>_<accession>`.
 * @returns URL of the `<identifier>.fa.gz` file.
 * @throws TypeError if the identifier has no non-empty prefix, or no accession of at least nine characters after the first `_`.
 */
function buildFastaUrl(identifier: string): string {
  const baseUrl = "https://hgdownload.soe.ucsc.edu/hubs/";
  const [prefix, accession] = identifier.split("_");
  // Fail with a descriptive message instead of crashing on `undefined` or
  // silently producing a URL with empty path segments.
  if (!prefix || accession === undefined || accession.length < 9) {
    throw new TypeError(
      `Cannot build FASTA URL from assembly identifier "${identifier}": expected "<prefix>_<accession>" with an accession of at least 9 characters.`
    );
  }
  const segments = [
    prefix,
    accession.slice(0, 3),
    accession.slice(3, 6),
    accession.slice(6, 9),
    identifier,
    `${identifier}.fa.gz`,
  ];
  return `${baseUrl}${segments.join("/")}`;
}

/**
 * Build the `request_state` object for the workflow landings request body.
 * The variant calling workflow, when a gene model URL is available, receives
 * URL inputs for the annotation GTF and the genome FASTA; every other case
 * receives only the reference genome ID.
 * @param workflowId - Workflow ID.
 * @param referenceGenome - Reference genome.
 * @param geneModelUrl - URL for gene model parameter.
 * @returns `request_state` value for the workflow landings request body.
 */
function getWorkflowLandingsRequestState(
  workflowId: WORKFLOW_ID,
  referenceGenome: string,
  geneModelUrl: string
): WorkflowLandingsBodyRequestState {
  const usesUrlInputs =
    workflowId === WORKFLOW_ID.VARIANT_CALLING && Boolean(geneModelUrl);
  if (!usesUrlInputs) {
    return { reference_genome: referenceGenome };
  }
  const annotationGtf = { ext: "gtf.gz", src: "url", url: geneModelUrl };
  const genomeFasta = {
    ext: "fasta.gz",
    src: "url",
    url: buildFastaUrl(referenceGenome),
  };
  return {
    "Annotation GTF": annotationGtf,
    "Genome fasta": genomeFasta,
  };
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@ import {
import { ViewContext } from "@databiosphere/findable-ui/lib/config/entities";
import { ComponentProps } from "react";
import { ROUTES } from "../../../../../routes/constants";
import { BRCDataCatalogGenome } from "../../../../apis/catalog/brc-analytics-catalog/common/entities";
import {
ANALYSIS_METHOD,
BRCDataCatalogGenome,
} from "../../../../apis/catalog/brc-analytics-catalog/common/entities";
import * as C from "../../../../components";
import { GENOME_BROWSER } from "./constants";

Expand Down Expand Up @@ -62,17 +65,25 @@ export const buildContigs = (
* @param cardProps - Card properties.
* @param cardProps.text - Card text.
* @param cardProps.title - Card title.
* @param cardProps.url - Card url.
* @param cardProps.analysisMethod - Analysis method.
* @returns Props to be used for the AnalysisMethod component.
*/
export const buildGenomeAnalysisMethod = (
genome: BRCDataCatalogGenome,
{ text, title, url }: Partial<CardProps> & { url: string }
{
analysisMethod,
text,
title,
}: Partial<CardProps> & {
analysisMethod: ANALYSIS_METHOD;
}
): ComponentProps<typeof C.AnalysisMethod> => {
return {
analysisMethod,
geneModelUrl: genome.geneModelUrl,
genomeVersionAssemblyId: genome.genomeVersionAssemblyId,
text,
title,
url,
};
};

Expand Down
1 change: 1 addition & 0 deletions files/build-catalog.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ async function buildGenomes(): Promise<BRCDataCatalogGenome[]> {
(row): BRCDataCatalogGenome => ({
chromosomes: parseNumber(row.Chromosomes),
contigs: parseNumber(row.Contigs),
geneModelUrl: row.geneModelUrl,
genomeVersionAssemblyId: row["Genome Version/Assembly ID"],
ncbiTaxonomyId: row.taxId,
organism: row.Organism,
Expand Down
41 changes: 39 additions & 2 deletions files/build-genomes-files.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd
import re
import requests
import urllib.parse

GENOMES_SOURCE_URL = "https://docs.google.com/spreadsheets/d/1NRfTvebPl6zJ0l9tCqBtq6YCrwV6_XDBlheq3L5HcvQ/gviz/tq?tqx=out:csv&sheet=GenomeDataTypes_Summary.csv"
ASSEMBLIES_URL = "https://hgdownload.soe.ucsc.edu/hubs/BRC/assemblyList.json"
Expand All @@ -14,14 +15,48 @@ def get_duplicate_ids(genomes_df):
def get_unmatched_assemblies(assemblies_df, result_df):
    """Return the set of ``asmId`` values in ``assemblies_df`` that do not occur in ``result_df``."""
    all_assembly_ids = set(assemblies_df["asmId"])
    matched_assembly_ids = set(result_df["asmId"])
    return all_assembly_ids.difference(matched_assembly_ids)

def _id_to_gene_model_url(asm_id):
    """Find the URL of a gzipped GTF gene model for an assembly on the UCSC hubs site.

    The assembly's ``genes`` directory listing is fetched and scanned for file
    names that start with the assembly ID and end in ``.gtf.gz``. A file whose
    name contains ``ncbiRefSeq`` is preferred; otherwise the last one
    containing ``augustus`` is used.

    Args:
        asm_id: Assembly ID, e.g. ``GCF_030504385.1``.

    Returns:
        The absolute URL of the chosen GTF file, or ``None`` when the listing
        request returns an HTTP error status or no suitable file is listed.
    """
    hubs_url = "https://hgdownload.soe.ucsc.edu/hubs/"
    components = [asm_id[0:3], asm_id[4:7], asm_id[7:10], asm_id[10:13], asm_id, "genes"]
    url = urllib.parse.urljoin(hubs_url, "/".join(components))
    # url looks something like https://hgdownload.soe.ucsc.edu/hubs/GCF/030/504/385/GCF_030504385.1/genes/
    # and contains html content with links to gene models.
    # We need to scrape this to get the gtf.
    print(f"fetching url {url}")
    # A timeout keeps a stalled connection from hanging the whole build.
    response = requests.get(url, timeout=60)
    try:
        response.raise_for_status()
    except requests.HTTPError:
        # FIXME?: Some accessions don't have a gene folder
        return None
    # Find link to gtf, should ideally be ncbiRefSeq, but augustus will do.
    # re.escape avoids a backslash inside an f-string expression, which is a
    # SyntaxError before Python 3.12.
    pattern = re.escape(asm_id) + r".*?\.gtf\.gz"
    augustus_file = None
    for match in re.findall(pattern, response.text):
        if "ncbiRefSeq" in match:
            return urllib.parse.urljoin(f"{url}/", match)
        elif "augustus" in match:
            augustus_file = match
    if augustus_file:
        return urllib.parse.urljoin(f"{url}/", augustus_file)
    # No match: this assembly has no usable gene model listed.
    return None


def add_gene_model_url(result_df: pd.DataFrame):
    """Add a ``geneModelUrl`` column to ``result_df`` in place.

    Each value is derived from the row's "Genome Version/Assembly ID" via
    ``_id_to_gene_model_url``, e.g. for an assembly located under
    https://hgdownload.soe.ucsc.edu/hubs/GCF/001/189/475/GCF_001189475.1
    """
    assembly_ids = result_df["Genome Version/Assembly ID"]
    gene_model_urls = assembly_ids.apply(_id_to_gene_model_url)
    result_df["geneModelUrl"] = gene_model_urls


def build_genomes_files():
print("Building files")

genomes_source_df = pd.read_csv(GENOMES_SOURCE_URL, keep_default_na=False, usecols=lambda name: re.fullmatch(r"Unnamed: \d+", name) is None)
assemblies_df = pd.DataFrame(requests.get(ASSEMBLIES_URL).json()["data"])

duplicate_ids = get_duplicate_ids(genomes_source_df)
print(f"Removing rows with duplicate Genome Version/Assembly ID values of: {", ".join(duplicate_ids)}")
print(f"Removing rows with duplicate Genome Version/Assembly ID values of: {', '.join(duplicate_ids)}")

deduped_genomes_df = genomes_source_df.drop_duplicates(subset=["Genome Version/Assembly ID"])

Expand All @@ -30,9 +65,11 @@ def build_genomes_files():

result_df = gen_bank_merge_df.combine_first(ref_seq_merge_df).dropna(subset=["ucscBrowser"])

add_gene_model_url(result_df)

unmatched_assemblies = get_unmatched_assemblies(assemblies_df, result_df)
if (len(unmatched_assemblies) != 0):
print(f"Omitted {len(unmatched_assemblies)} assemblies that had no matches: {", ".join(unmatched_assemblies)}")
print(f"Omitted {len(unmatched_assemblies)} assemblies that had no matches: {', '.join(unmatched_assemblies)}")

result_df["taxId"] = result_df["taxId"].astype(int)

Expand Down
1 change: 1 addition & 0 deletions files/entities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ export interface SourceGenome {
comName: string;
Contigs: string;
genBank: string;
geneModelUrl: string;
"Genome Source": string;
"Genome Version/Assembly ID": string;
identical: string;
Expand Down
Loading
Loading