# daily_arxiv.py

import datetime
import requests
import json
import arxiv
import os

# Base endpoint of the paperswithcode lookup API. get_daily_papers() appends an
# arXiv short id to this URL and reads the "official" entry of the JSON reply.
base_url = "https://arxiv.paperswithcode.com/api/v0/papers/"


def get_authors(authors, first_author=False):
    """
    Format a sequence of authors as a display string.

    @param authors: sequence of authors; each item is rendered with str()
    @param first_author: bool, when true only the first author is returned
    @return str: comma-separated author names, or the first author's name;
                 an empty string when there are no authors
    """
    # Treat a missing author list like an empty one.
    authors = authors or []
    if first_author:
        # Always hand back a string (not the raw author object) and do not
        # raise IndexError when the list is empty.
        return str(authors[0]) if authors else ""
    return ", ".join(str(author) for author in authors)


def sort_papers(papers):
    """
    Return a new dict holding the entries of `papers` in descending key order.

    @param papers: dict
    @return dict: same key/value pairs, largest key first
    """
    return {paper_key: papers[paper_key]
            for paper_key in sorted(papers, reverse=True)}


def get_daily_papers(query="slam", max_results=2, timeout=10):
    """
    Search arXiv for the most recently submitted papers matching `query` and
    render each one as a Markdown table row, including a code link when the
    lookup at `base_url` reports an official repository.

    @param query: str, arXiv search query
    @param max_results: int, maximum number of search results to request
    @param timeout: seconds to wait for each code-lookup HTTP request
    @return content: dict mapping the version-less arXiv id to its table row
    """

    # output
    content = dict()

    search_engine = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )

    for result in search_engine.results():

        paper_id = result.get_short_id()
        paper_title = result.title
        paper_url = result.entry_id

        # '~' is the placeholder for a missing comment; control characters are
        # stripped so the comment cannot break the single-line table row.
        comment = '~'
        if result.comment:
            comment = result.comment.replace(
                '\r', '').replace('\n', '').replace('\t', '')

        code_url = base_url + paper_id

        publish_time = result.published.date()

        print("Time = ", publish_time,
              " title = ", paper_title,
              ' comment = ', comment)

        # eg: 2108.09112v1 -> 2108.09112
        # Only a trailing "v<digits>" is a version suffix; a 'v' elsewhere in
        # the id (e.g. inside an old-style archive name) must be left alone.
        head, sep, tail = paper_id.rpartition('v')
        paper_key = head if sep and tail.isdigit() else paper_id

        try:
            # Without a timeout a stalled connection would block the run forever.
            r = requests.get(code_url, timeout=timeout).json()
            # source code link
            if "official" in r and r["official"]:
                repo_url = r["official"]["url"]
                code_cell = f"[code]({repo_url})"
            else:
                code_cell = "~"
            content[paper_key] = f"|**{publish_time}**|**[{paper_title}]({paper_url})**|{comment}|{code_cell}|\n"

        except Exception as e:
            # Best effort: a failed lookup skips this paper instead of aborting
            # the whole collection run.
            print(f"exception: {e} with id: {paper_key}")
    return content


def update_json_file(filename, content, topic=None):
    """
    Merge freshly collected papers into the JSON store on disk.

    @param filename: str, path of the JSON file (created when missing)
    @param content: dict, paper key -> Markdown table row
    @param topic: top-level key under which `content` is stored. When omitted,
                  the module-level name `topic` is used, which is how the
                  two-argument call has always behaved.
    @return None
    """
    if topic is None:
        # Legacy path: callers that pass no topic rely on a module-level
        # `topic` variable being set before the call.
        topic = globals().get("topic")
        if topic is None:
            raise NameError("name 'topic' is not defined")

    # A missing or empty file is treated as an empty store.
    json_data = {}
    if os.path.exists(filename):
        with open(filename, "r") as f:
            raw = f.read()
        if raw:
            json_data = json.loads(raw)

    # Entries already stored for this topic are kept; same-key rows are
    # overwritten by the new content.
    if topic in json_data:
        json_data[topic].update(content)
    else:
        json_data[topic] = content

    with open(filename, "w") as f:
        json.dump(json_data, f)


def json_to_md(filename, md_filename='./README.md'):
    """
    Render the papers stored in a JSON file as a single Markdown table.

    @param filename: str, path of the JSON store (topic -> {paper key -> row})
    @param md_filename: str, path of the Markdown file to (over)write
    @return None
    """

    with open(filename, "r") as f:
        raw = f.read()
    # An empty file is treated as "no papers yet".
    data = json.loads(raw) if raw else {}

    # Mode "w" creates the file or truncates an existing one, so no separate
    # clearing pass is needed. UTF-8 is set explicitly because the rows may
    # contain non-ASCII text.
    with open(md_filename, "w", encoding="utf-8") as f:

        f.write("| Date | Title | Comment | Code |\n")
        f.write("|:-----|:------|:--------|:-----|\n")
        for day_content in data.values():
            if not day_content:
                continue

            # Rows are emitted per topic in descending key order.
            for row in sort_papers(day_content).values():
                if row is not None:
                    f.write(row)


if __name__ == "__main__":

    # Fetch size per keyword: 20 by default, 50 when `init` is set, and 2 when
    # `demo` is set (demo is checked last, so it wins over init).
    init = False
    demo = False
    num_result = 20
    if init:
        num_result = 50
    if demo:
        num_result = 2

    json_file = 'daily-arxiv.json'

    # Raw strings keep the backslash-space sequences exactly as written without
    # relying on invalid escape sequences, which newer Python versions warn on.
    keywords = {
        # 'fl': r"abs:federated\ learning",
        # 'da': r"abs:'domain\ adaptation'",
        # 'dg': r"abs:'domain\ generalization'",
        'gan': r"abs:generative\ adversarial\ network",
        'diffusion': r"abs:diffusion\ model",
        # 'poi': "abs:poison+attack",
        'code': 'co:code',
    }

    content = {}
    # NOTE: update_json_file() is called below without an explicit topic, so it
    # uses the module-level `topic`, i.e. the last key iterated here. Keep the
    # loop variable named `topic`.
    for topic, keyword in keywords.items():
        content.update(get_daily_papers(query=keyword, max_results=num_result))

    update_json_file(json_file, content)
    json_to_md(json_file)
    print('finished')