Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AG-1117: Add new ensembl info fields to gene_info transform #95

Merged
merged 1 commit into from
Nov 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@
meanlogcpm: mean
quartile3logcpm: third_quartile
maximumlogcpm: max
possible_replacement: ensembl_possible_replacements
permalink: ensembl_permalink
provenance:
- syn25953363.6
- syn12514826.4
Expand Down
38 changes: 32 additions & 6 deletions src/agoradatatools/etl/transform/gene_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,33 +61,42 @@ def transform_gene_info(
druggability = druggability[useful_columns]

target_list = nest_fields(
df=target_list, grouping="ensembl_gene_id", new_column="target_nominations", drop_columns=["ensembl_gene_id"]
df=target_list,
grouping="ensembl_gene_id",
new_column="target_nominations",
drop_columns=["ensembl_gene_id"],
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: changes like this come from the black formatter; I didn't change anything functional.

)

median_expression = nest_fields(
df=median_expression, grouping="ensembl_gene_id", new_column="median_expression", drop_columns=["ensembl_gene_id"]
df=median_expression,
grouping="ensembl_gene_id",
new_column="median_expression",
drop_columns=["ensembl_gene_id"],
)

druggability = nest_fields(
df=druggability, grouping="ensembl_gene_id", new_column="druggability", drop_columns=["ensembl_gene_id"]
df=druggability,
grouping="ensembl_gene_id",
new_column="druggability",
drop_columns=["ensembl_gene_id"],
)

biodomains = (
biodomains.groupby("ensembl_gene_id")["biodomain"]
.apply(set) # ensure unique biodomain names
.apply(set) # ensure unique biodomain names
.apply(list)
.reset_index()
.rename(columns={"biodomain": "biodomains"})
)

# sort biodomains list alphabetically
biodomains['biodomains'] = biodomains['biodomains'].apply(sorted)
biodomains["biodomains"] = biodomains["biodomains"].apply(sorted)

# For genes with either is_adi or is_tep set to True, create a resource URL that opens
# the portal page to the specific gene. This must be done using the hgnc_symbol from the
# tep_info file and not the symbol in gene_info, because there are some mismatches
# between the two and the hgnc_symbol from tep_info is the correct one to use here.
# resource_url should be NA if both is_adi and is_tep are false.
# resource_url should be NA if both is_adi and is_tep are false.
resource_url_prefix = "https://adknowledgeportal.synapse.org/Explore/Target%20Enabling%20Resources?QueryWrapper0=%7B%22sql%22%3A%22select%20*%20from%20syn26146692%20WHERE%20%60isPublic%60%20%3D%20true%22%2C%22limit%22%3A25%2C%22offset%22%3A0%2C%22selectedFacets%22%3A%5B%7B%22concreteType%22%3A%22org.sagebionetworks.repo.model.table.FacetColumnValuesRequest%22%2C%22columnName%22%3A%22target%22%2C%22facetValues%22%3A%5B%22"
resource_url_suffix = "%22%5D%7D%5D%7D"
tep_info["resource_url"] = tep_info.apply(
Expand All @@ -97,6 +106,21 @@ def transform_gene_info(
axis=1,
)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The functional changes I made start here.

ensembl_info = gene_metadata[
[
"ensembl_gene_id",
"ensembl_release",
"ensembl_possible_replacements",
"ensembl_permalink",
]
]
ensembl_info = nest_fields(
df=ensembl_info,
grouping="ensembl_gene_id",
new_column="ensembl_info",
drop_columns=["ensembl_gene_id"],
)

# Merge all the datasets
gene_info = gene_metadata

Expand All @@ -110,6 +134,7 @@ def transform_gene_info(
druggability,
biodomains,
tep_info,
ensembl_info,
]:
gene_info = pd.merge(
left=gene_info,
Expand Down Expand Up @@ -181,6 +206,7 @@ def transform_gene_info(
"is_adi",
"is_tep",
"resource_url",
"ensembl_info",
]
]

Expand Down
2 changes: 2 additions & 0 deletions test_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@
meanlogcpm: mean
quartile3logcpm: third_quartile
maximumlogcpm: max
possible_replacement: ensembl_possible_replacements
permalink: ensembl_permalink
provenance:
- syn25953363.6
- syn12514826.4
Expand Down
Loading