Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
eidens committed May 13, 2024
2 parents 77e67ce + d6e066d commit 88da5b3
Show file tree
Hide file tree
Showing 6 changed files with 366 additions and 146 deletions.
16 changes: 16 additions & 0 deletions docker-compose.tests.yml
Original file line number Diff line number Diff line change
@@ -1,16 +1,32 @@
services:
  flask:
    depends_on:
      # Don't start the app until its backing stores pass their healthchecks.
      neo4j:
        condition: service_healthy
      redis:
        condition: service_healthy
    env_file:
      - ./.env.ci

  neo4j:
    env_file:
      - ./.env.ci
    healthcheck:
      # Probe readiness with a trivial Cypher query via cypher-shell.
      test: cypher-shell -u ${NEO_USERNAME} -p ${NEO_PASSWORD} --non-interactive "MATCH (n) RETURN ID(n) LIMIT 1"
      # wait up to 10 minutes for neo4j to be ready
      interval: 10s
      timeout: 10s
      retries: 60
    ports:
      - 7474:7474
      - 7687:7687
    volumes:
      - ./tests/data/neo4j-data:/data
      - ./tests/data/neo4j-logs:/logs
      - ./tests/data/neo4j-plugins:/plugins

  redis:
    healthcheck:
      test: redis-cli --raw incr ping
    volumes:
      - ./tests/data/redis-data:/data
122 changes: 114 additions & 8 deletions peerreview/published.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,29 @@
from argparse import ArgumentParser
from pandas import DataFrame, concat, read_csv, to_datetime
from tqdm import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm
import common.logging
from peerreview.neohypo import BioRxiv, CrossRefDOI
from sdg.sdnode import API
from . import DB
from .queries import (
NotYetPublished, UpdatePublicationStatus,
)

logger = common.logging.get_logger(__name__)


def _date_from_parts(parts):
if len(parts) < 1 or len(parts) > 3:
return None
if len(parts) == 1:
parts.append(1)
parts.append(1)
if len(parts) == 2:
parts.append(1)
return f"{parts[0]}-{parts[1]}-{parts[2]}"


class PublicationUpdate:

def __init__(self, db):
Expand All @@ -23,40 +37,132 @@ def get_not_published(self, limit_date: str):
return dois

def check_publication_status(self, preprint_doi):
    """Return the journal DOI a preprint was published under, or None.

    Abstract hook: subclasses implement the actual lookup strategy.
    May also return the sentinel string "NA" (treated by run() as
    "explicitly not published").
    """
    # Diff residue removed: the bioRxiv lookup now lives in BiorxivPubUpdate;
    # the base class deliberately has no implementation.
    raise NotImplementedError()

def update_status(self, preprint_doi, published_doi):
    """Record journal publication info for a preprint in the database.

    Fetches the published article's metadata from CrossRef and stores
    the journal title and publication date on the preprint node.

    Returns a (published_journal_title, published_date) tuple;
    published_date may be None when CrossRef has no date parts.
    """
    cross_ref_metadata = self.crossref.details(published_doi)
    published_journal_title = cross_ref_metadata.get('container-title', '')
    # CrossRef nests the date as {'published': {'date-parts': [[Y, M, D]]}};
    # any level may be missing, hence the chained defaults.
    published_date = _date_from_parts(cross_ref_metadata.get('published', {}).get('date-parts', [[]])[0])

    params = {
        'preprint_doi': preprint_doi,
        'published_doi': published_doi,
        'published_journal_title': published_journal_title,
        'published_date': published_date,
    }
    update_published_status = UpdatePublicationStatus(params=params)
    self.db.query(update_published_status)
    # Diff residue removed: a stale bare `return published_journal_title`
    # preceded this line, making the tuple return unreachable.
    return published_journal_title, published_date

def run(self, limit_date: str):
    """Scan preprints posted since limit_date and record any journal publication.

    For each preprint without publication info, queries the concrete
    lookup (check_publication_status) and persists the result.
    """
    not_yet_published = self.get_not_published(limit_date)
    logger.info(f"{len(not_yet_published)} preprints posted since {limit_date} with no journal publication info yet.")
    with logging_redirect_tqdm():
        for preprint_doi in tqdm(not_yet_published):
            published_doi = self.check_publication_status(preprint_doi)
            # "NA" is the explicit "not published" sentinel (see check_publication_status).
            if (published_doi is not None) and (published_doi != "NA"):
                journal, date = self.update_status(preprint_doi, published_doi)
                logger.info(f"{preprint_doi} --> {published_doi} in {journal} on {date}")


class BiorxivPubUpdate(PublicationUpdate):
    """Publication-status lookup backed by the bioRxiv details endpoint."""

    def check_publication_status(self, preprint_doi):
        """Return the published DOI reported by bioRxiv for this preprint, if any."""
        details = self.biorxiv.details(preprint_doi)
        return details.get('published')


class CrossRefPreprintApi(API):
    """Fetches preprint -> published-article DOI relations from the CrossRef REST API.

    Results are cached to a local CSV; subsequent runs only fetch records
    indexed after the newest cached entry, then merge into the cache.
    """

    def __init__(self):
        super().__init__()
        self.url = 'https://api.crossref.org/types/posted-content/works'
        self._cache_file = 'crossref_preprints.csv'
        self._data = None  # in-memory copy of the cache, lazily populated

    def _published_preprints(self):
        """Return a DataFrame with columns preprint_doi, published_doi, index_date.

        Loads the CSV cache if present, fetches anything newer from CrossRef,
        merges, persists, and memoizes the result for the process lifetime.
        """
        if self._data is None:
            try:
                data = read_csv(self._cache_file)
                # Only ask CrossRef for records indexed after our newest entry.
                latest_index_date = to_datetime(data['index_date'].max()).strftime('%Y-%m-%d')
            except FileNotFoundError:
                data = None
                latest_index_date = None

            new_data = self._new_data(latest_index_date)
            if data is None:
                self._data = new_data
            else:
                self._data = (
                    concat([data, new_data])
                    # A preprint may be re-indexed; keep the most recent record.
                    .drop_duplicates(subset='preprint_doi', keep='last')
                    .sort_values(['index_date', 'preprint_doi'], ascending=False)
                    .reset_index(drop=True)
                )

            self._data.to_csv(self._cache_file, index=False)
        return self._data

    def _new_data(self, latest_index_date):
        """Fetch preprint relations indexed after latest_index_date (all if None).

        Pages through CrossRef cursor-based ("deep") pagination and returns a
        DataFrame with columns preprint_doi, published_doi, index_date.
        """
        params = {
            'filter': 'relation.type:is-preprint-of',
            'select': 'DOI,indexed,relation',
            'rows': 1000,
            'cursor': '*',
        }
        if latest_index_date:
            params['filter'] += f',from-index-date:{latest_index_date}'

        new_data = []
        n_items_fetched = 0
        while True:
            message = self.rest2data(self.url, params).get('message', {})
            items = message.get('items', [])
            if not items:
                break

            n_items_fetched += len(items)
            logger.info(f"Fetched {n_items_fetched}/{message['total-results']} items from CrossRef API.")

            for item in items:
                relation = item['relation']['is-preprint-of'][0]
                # Only DOI-typed relations can be resolved downstream.
                if relation['id-type'] != 'doi':
                    continue
                new_data.append({
                    'preprint_doi': item['DOI'],
                    'published_doi': relation['id'],
                    'index_date': _date_from_parts(item['indexed']['date-parts'][0]),
                })

            next_cursor = message.get('next-cursor', None)
            # Fix: previously a missing cursor was sent back as cursor=None,
            # producing a malformed follow-up request; stop cleanly instead.
            if not next_cursor:
                break
            params['cursor'] = next_cursor

        return DataFrame(new_data)


class CrossRefPubUpdate(PublicationUpdate):
    """Publication-status lookup backed by the cached CrossRef preprint relations."""

    def __init__(self, db):
        super().__init__(db)
        self.crossref_preprints = CrossRefPreprintApi()

    def check_publication_status(self, preprint_doi):
        """Return the published DOI recorded by CrossRef for this preprint, or None.

        When several publications are linked to one preprint, the first is
        returned and a warning is logged.
        """
        table = self.crossref_preprints._published_preprints()
        published_dois = table.loc[table['preprint_doi'] == preprint_doi, 'published_doi']
        if published_dois.empty:
            return None
        if len(published_dois) > 1:
            logger.warning(f"Multiple published DOIs for preprint {preprint_doi}: {published_dois}")
        return published_dois.iat[0]


def main():
    """CLI entry point: update the journal publication status of preprints."""
    parser = ArgumentParser(description="Upload reviews linked to preprints and updates publication status of preprints.")
    parser.add_argument('--limit-date', default='1900-01-01', help='Limit posting date: only preprints older than this limit date will be scanned for journal publication status.')
    args = parser.parse_args()
    limit_date = args.limit_date
    # Run both concrete lookup strategies. Diff residue removed: the base
    # PublicationUpdate is abstract (check_publication_status raises
    # NotImplementedError) and must not be run directly.
    BiorxivPubUpdate(DB).run(limit_date=limit_date)
    CrossRefPubUpdate(DB).run(limit_date=limit_date)


if __name__ == '__main__':
Expand Down
4 changes: 3 additions & 1 deletion peerreview/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,15 @@ class UpdatePublicationStatus(Query):
MATCH (a:Article {doi: $preprint_doi})
SET
a.journal_doi = $published_doi,
a.published_journal_title = $published_journal_title
a.published_journal_title = $published_journal_title,
a.published_date = $published_date
RETURN a
"""
map = {
"preprint_doi": [],
"published_doi": [],
"published_journal_title": [],
"published_date": [],
}
returns = ["a"]

Expand Down
26 changes: 0 additions & 26 deletions scripts/run-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,32 +50,6 @@ set +o allexport
echo "Starting docker-compose service"
eeb up -d flask

# waits until neo4j is ready by trying to execute a cypher query using the cypher-shell interface
# NOTE: the heredoc delimiter is quoted ('END_HEREDOC'), so $NEO_* expand
# inside the container where the script runs, not on the host.
wait_for_neo4j=$(
cat <<'END_HEREDOC'
t=0
until echo "MATCH(n) RETURN COUNT(n);" | cypher-shell -a $NEO_URI -u $NEO_USERNAME -p $NEO_PASSWORD;
do
if [[ $t -gt 60 ]]; then
echo "Neo4j not ready after 60 seconds, aborting"
exit 1
fi
t=$((t+1))
echo -ne "\r"
echo -ne "Waiting for Neo4j to be ready... ($t s)"
sleep 1
done
echo -ne "\r"
if [[ $t -gt 0 ]]; then
echo "Neo4j ready after $t seconds"
else
echo "Neo4j already up"
fi
END_HEREDOC
)
# Run the polling loop inside the neo4j container, where cypher-shell is available.
eeb exec --no-TTY neo4j bash -c "$wait_for_neo4j"

echo "Installing test dependencies"
eeb exec --no-TTY flask pip install pytest

Expand Down
Loading

0 comments on commit 88da5b3

Please sign in to comment.