sources.fetch_paper_metadata

  1import time
  2import arxiv
  3from datetime import datetime
  4from pathlib import Path
  5from typing import Sequence
  6
  7from config import data_dir, resources_dir
  8
# Shared arXiv API client reused by every request made from this module.
# Configured with num_retries=1 and delay_seconds=7.0
# (NOTE(review): per the arxiv package docs, delay_seconds is the pause
# between successive API requests — confirm for the pinned version).
_client = arxiv.Client(num_retries=1, delay_seconds=7.0)

# Cool-down, in seconds, that _check_ratelimit() enforces after a 429 has
# been recorded by _record_ratelimit().
_RATELIMIT_WAIT = 60.0
 12
 13
 14def _vault_dir() -> Path:
 15    """Obsidian vault location — resolved off data_dir() so it tracks LINXIV_DATA_DIR
 16    like the DB and PDFs, instead of being pinned to the source tree. Called per-use
 17    (not cached at import) to avoid the import-time-frozen trap."""
 18    return data_dir() / "obsidian_vault" / "arXivVault"
 19
 20
 21def _ratelimit_file() -> Path:
 22    """arXiv rate-limit timestamp file — lives in data_dir() so it tracks LINXIV_DATA_DIR
 23    like the DB, PDFs, and vault. Resolved per-use (not cached at import)."""
 24    return data_dir() / ".arxiv_ratelimit"
 25
 26
 27def _check_ratelimit() -> None:
 28    path = _ratelimit_file()
 29    if not path.exists():
 30        return
 31    with open(path) as f:
 32        last = datetime.fromisoformat(f.read().strip())
 33    remaining = _RATELIMIT_WAIT - (datetime.now() - last).total_seconds()
 34    if remaining > 0:
 35        print(f"[arxiv] rate limited — waiting {remaining:.0f}s")
 36        time.sleep(remaining)
 37
 38
 39def _record_ratelimit() -> None:
 40    path = _ratelimit_file()
 41    path.parent.mkdir(parents=True, exist_ok=True)
 42    with open(path, "w") as f:
 43        f.write(datetime.now().isoformat())
 44
 45
 46def _arxiv_call(fn):
 47    _check_ratelimit()
 48    try:
 49        return fn()
 50    except Exception as e:
 51        if "429" in str(e):
 52            _record_ratelimit()
 53            print("[arxiv] 429 received — recorded. Retry your search in 60s.")
 54        raise
 55
 56def fetch_paper_metadata(source_id: str, print_on: bool = False) -> arxiv.Result:
 57    search = arxiv.Search(id_list=[source_id])
 58    paper = _arxiv_call(lambda: next(_client.results(search)))
 59
 60    if print_on:
 61        print(f"Title:    {paper.title}")
 62        print(f"Date:     {paper.published.strftime('%Y-%m-%d')}")
 63        print(f"Authors:  {', '.join(author.name for author in paper.authors)}")
 64        print(f"Category: {paper.primary_category}")
 65        print(f"DOI:      {paper.doi}")
 66        print(f"PDF URL:  {paper.pdf_url}")
 67        print("-" * 30)
 68        print(f"Abstract:\n{paper.summary}")
 69
 70    return paper
 71
 72def search_papers(
 73    query: str,
 74    max_results: int = 10,
 75    sort_by: arxiv.SortCriterion = arxiv.SortCriterion.Relevance,
 76    sort_order: arxiv.SortOrder = arxiv.SortOrder.Descending,
 77    print_on: bool = False,
 78) -> list[arxiv.Result]:
 79    search = arxiv.Search(query=query, max_results=max_results, sort_by=sort_by, sort_order=sort_order)
 80    papers = _arxiv_call(lambda: list(_client.results(search)))
 81
 82    if print_on:
 83        for paper in papers:
 84            print(f"Title:    {paper.title}")
 85            print(f"Date:     {paper.published.strftime('%Y-%m-%d')}")
 86            print(f"Authors:  {', '.join(author.name for author in paper.authors)}")
 87            print(f"Category: {paper.primary_category}")
 88            print("-" * 30)
 89
 90    return papers
 91
 92def gen_md_files(papers: list[arxiv.Result], additional_tags: None | Sequence[str] = None) -> None:
 93    for paper in papers:
 94        gen_md_file(paper, additional_tags=additional_tags)
 95
 96def gen_md_file(paper: arxiv.Result, additional_tags: None | Sequence[str] = None, print_on: bool = False):
 97    title: str = paper.title
 98    bare_id: str = paper.entry_id.split('/')[-1]
 99    source_id: str = f"arxiv:{bare_id}"
100    url: str = f"https://arxiv.org/abs/{bare_id}"
101    authors: list[str] = [author.name for author in paper.authors]
102    tags: list[str] = ["clippings", "research", "clipping"]
103
104    if additional_tags:
105        for s in additional_tags:
106            tags.append(s)
107
108    date = paper.published.strftime('%Y-%m-%d')
109    vault = _vault_dir()
110    vault.mkdir(parents=True, exist_ok=True)
111    filename = vault / f"{bare_id}.md"
112
113    author_list = "\n".join([f'  - "[[{name}]]"' for name in authors])
114    tag_list = "\n".join([f'- {tag}' for tag in tags])
115    with open(resources_dir() / "formats" / "table_format.md", "r", encoding="utf-8") as f:
116        template = f.read()
117        final_content = template.format(
118            title=title,
119            url=url,
120            author_list=author_list,
121            date=date,
122            tag_list=tag_list
123        )
124    if print_on:
125        print(final_content)
126    with open(filename, "w", encoding="utf-8") as f:
127        f.write(final_content)
def fetch_paper_metadata(source_id: str, print_on: bool = False) -> arxiv.Result:
    """Fetch the metadata record for a single arXiv ID.

    Args:
        source_id: The arXiv identifier passed to ``arxiv.Search(id_list=...)``.
        print_on: When true, print title, date, authors, category, DOI,
            PDF URL, and abstract to stdout.

    Returns:
        The first result yielded by the client for ``source_id``.

    Raises:
        StopIteration: If the client yields no result for ``source_id``.
        Exception: Whatever the request raises, re-raised by ``_arxiv_call``.
    """
    query = arxiv.Search(id_list=[source_id])

    def _first_result():
        return next(_client.results(query))

    paper = _arxiv_call(_first_result)

    if not print_on:
        return paper

    print(f"Title:    {paper.title}")
    published = paper.published.strftime('%Y-%m-%d')
    print(f"Date:     {published}")
    author_names = ', '.join(person.name for person in paper.authors)
    print(f"Authors:  {author_names}")
    print(f"Category: {paper.primary_category}")
    print(f"DOI:      {paper.doi}")
    print(f"PDF URL:  {paper.pdf_url}")
    print("-" * 30)
    print(f"Abstract:\n{paper.summary}")
    return paper
def search_papers(
    query: str,
    max_results: int = 10,
    sort_by: arxiv.SortCriterion = arxiv.SortCriterion.Relevance,
    sort_order: arxiv.SortOrder = arxiv.SortOrder.Descending,
    print_on: bool = False,
) -> list[arxiv.Result]:
    """Run an arXiv search and return all results as a list.

    Args:
        query: Search string passed to ``arxiv.Search``.
        max_results: Maximum number of results requested.
        sort_by: Sort criterion passed to ``arxiv.Search``.
        sort_order: Sort direction passed to ``arxiv.Search``.
        print_on: When true, print title, date, authors, and category of
            every result to stdout.

    Returns:
        The fully materialised list of results.
    """
    request = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=sort_by,
        sort_order=sort_order,
    )

    def _collect():
        return list(_client.results(request))

    papers = _arxiv_call(_collect)

    if not print_on:
        return papers

    for entry in papers:
        print(f"Title:    {entry.title}")
        published = entry.published.strftime('%Y-%m-%d')
        print(f"Date:     {published}")
        author_names = ', '.join(person.name for person in entry.authors)
        print(f"Authors:  {author_names}")
        print(f"Category: {entry.primary_category}")
        print("-" * 30)
    return papers
def gen_md_files(papers: list[arxiv.Result], additional_tags: None | Sequence[str] = None) -> None:
    """Write one Markdown note per paper by delegating to ``gen_md_file``.

    Args:
        papers: Results to render, processed in order.
        additional_tags: Optional extra tags forwarded unchanged to every
            ``gen_md_file`` call.
    """
    for entry in papers:
        gen_md_file(entry, additional_tags=additional_tags)
def gen_md_file(paper: arxiv.Result, additional_tags: None | Sequence[str] = None, print_on: bool = False):
    """Render one paper into a Markdown note inside the Obsidian vault.

    The note is produced by filling ``formats/table_format.md`` (under
    ``resources_dir()``) with the fields ``title``, ``url``, ``author_list``,
    ``date``, and ``tag_list``, and is written to ``<vault>/<bare_id>.md``,
    overwriting any existing file of that name.

    Args:
        paper: The arXiv result to render.
        additional_tags: Optional extra tags appended after the default ones.
        print_on: When true, also print the rendered note to stdout.
    """
    title: str = paper.title
    # Last path segment of the entry URL; used for the note's filename.
    bare_id: str = paper.entry_id.split('/')[-1]

    # Old-style identifiers carry an archive prefix ("hep-th/9901001v1").
    # Keeping only the last segment would produce a dead link, so the URL
    # uses everything after "/abs/". Falls back to bare_id if the entry URL
    # has an unexpected shape.
    _, found, abs_id = paper.entry_id.partition("/abs/")
    url: str = f"https://arxiv.org/abs/{abs_id if found else bare_id}"

    authors: list[str] = [author.name for author in paper.authors]
    tags: list[str] = ["clippings", "research", "clipping"]
    if additional_tags:
        tags.extend(additional_tags)

    date = paper.published.strftime('%Y-%m-%d')
    vault = _vault_dir()
    vault.mkdir(parents=True, exist_ok=True)
    filename = vault / f"{bare_id}.md"

    author_list = "\n".join(f'  - "[[{name}]]"' for name in authors)
    tag_list = "\n".join(f'- {tag}' for tag in tags)

    template_path = resources_dir() / "formats" / "table_format.md"
    template = template_path.read_text(encoding="utf-8")
    final_content = template.format(
        title=title,
        url=url,
        author_list=author_list,
        date=date,
        tag_list=tag_list,
    )

    if print_on:
        print(final_content)
    filename.write_text(final_content, encoding="utf-8")