sources.fetch_paper_metadata
"""arXiv helpers: fetch/search paper metadata and write Obsidian notes for them."""
import time
import arxiv
from datetime import datetime
from pathlib import Path
from typing import Sequence

from config import data_dir, resources_dir

# Shared client for every arXiv request made by this module.
_client = arxiv.Client(num_retries=1, delay_seconds=7.0)

# Seconds to stay quiet after arXiv answered with a 429.
_RATELIMIT_WAIT = 60.0


def _vault_dir() -> Path:
    """Obsidian vault location — resolved off data_dir() so it tracks LINXIV_DATA_DIR
    like the DB and PDFs, instead of being pinned to the source tree. Called per-use
    (not cached at import) to avoid the import-time-frozen trap."""
    return data_dir() / "obsidian_vault" / "arXivVault"


def _ratelimit_file() -> Path:
    """arXiv rate-limit timestamp file — lives in data_dir() so it tracks LINXIV_DATA_DIR
    like the DB, PDFs, and vault. Resolved per-use (not cached at import)."""
    return data_dir() / ".arxiv_ratelimit"


def _check_ratelimit() -> None:
    """Sleep out the remainder of the cooldown recorded by `_record_ratelimit`.

    Returns immediately when no timestamp file exists, when the file does not
    hold a parseable ISO timestamp, or when the cooldown has already elapsed.
    """
    path = _ratelimit_file()
    try:
        stamp = path.read_text().strip()
    except FileNotFoundError:
        return

    try:
        last = datetime.fromisoformat(stamp)
    except ValueError:
        # An empty or truncated file (e.g. an interrupted write) carries no
        # usable cooldown; failing here would block every later arXiv call
        # until someone deleted the file by hand.
        return

    remaining = _RATELIMIT_WAIT - (datetime.now() - last).total_seconds()
    if remaining > 0:
        print(f"[arxiv] rate limited — waiting {remaining:.0f}s")
        time.sleep(remaining)


def _record_ratelimit() -> None:
    """Write the current local time (ISO format) to the rate-limit file."""
    path = _ratelimit_file()
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w") as f:
        f.write(datetime.now().isoformat())


def _arxiv_call(fn):
    """Run ``fn()`` after honoring any recorded rate-limit cooldown.

    Any exception raised by ``fn`` is re-raised unchanged. When the exception
    text contains "429", the current time is recorded first so that the next
    call waits before contacting arXiv again.
    """
    _check_ratelimit()
    try:
        return fn()
    except Exception as e:
        if "429" in str(e):
            _record_ratelimit()
            # Derive the hint from the constant so the message cannot drift
            # away from the wait that is actually enforced.
            print(f"[arxiv] 429 received — recorded. Retry your search in {_RATELIMIT_WAIT:.0f}s.")
        raise


def fetch_paper_metadata(source_id: str, print_on: bool = False) -> arxiv.Result:
    """Fetch the arXiv record for a single id.

    Args:
        source_id: Identifier passed to ``arxiv.Search(id_list=[...])``.
        print_on: When true, print title, date, authors, category, DOI,
            PDF URL and abstract of the fetched paper.

    Returns:
        The first result produced by the shared client for this id.

    Raises:
        StopIteration: Propagated from ``next()`` when the client yields no
            result. Errors raised by the client are re-raised by ``_arxiv_call``.
    """
    query = arxiv.Search(id_list=[source_id])

    def _first_result() -> arxiv.Result:
        return next(_client.results(query))

    paper = _arxiv_call(_first_result)
    if not print_on:
        return paper

    print(f"Title: {paper.title}")
    print(f"Date: {paper.published.strftime('%Y-%m-%d')}")
    print(f"Authors: {', '.join(author.name for author in paper.authors)}")
    print(f"Category: {paper.primary_category}")
    print(f"DOI: {paper.doi}")
    print(f"PDF URL: {paper.pdf_url}")
    print("-" * 30)
    print(f"Abstract:\n{paper.summary}")
    return paper


def search_papers(
    query: str,
    max_results: int = 10,
    sort_by: arxiv.SortCriterion = arxiv.SortCriterion.Relevance,
    sort_order: arxiv.SortOrder = arxiv.SortOrder.Descending,
    print_on: bool = False,
) -> list[arxiv.Result]:
    """Run an arXiv query and return all results as a list.

    Args:
        query: Query string handed to ``arxiv.Search``.
        max_results: Upper bound on results, forwarded to ``arxiv.Search``.
        sort_by: Sort criterion, forwarded to ``arxiv.Search``.
        sort_order: Sort direction, forwarded to ``arxiv.Search``.
        print_on: When true, print a short summary block for every result.

    Returns:
        The results of the query, fully materialized.
    """
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=sort_by,
        sort_order=sort_order,
    )

    def _collect() -> list[arxiv.Result]:
        # Materialize inside the guarded call so request errors raised while
        # iterating are seen by _arxiv_call.
        return list(_client.results(search))

    papers = _arxiv_call(_collect)
    if not print_on:
        return papers

    for paper in papers:
        print(f"Title: {paper.title}")
        print(f"Date: {paper.published.strftime('%Y-%m-%d')}")
        print(f"Authors: {', '.join(author.name for author in paper.authors)}")
        print(f"Category: {paper.primary_category}")
        print("-" * 30)
    return papers


def gen_md_files(papers: list[arxiv.Result], additional_tags: None | Sequence[str] = None) -> None:
    """Write one vault note per paper by calling `gen_md_file` on each."""
    for paper in papers:
        gen_md_file(paper, additional_tags=additional_tags)


def gen_md_file(paper: arxiv.Result, additional_tags: None | Sequence[str] = None, print_on: bool = False):
    """Render one paper into a Markdown note inside the vault directory.

    The note is produced by filling ``formats/table_format.md`` (under
    ``resources_dir()``) with the fields ``title``, ``url``, ``author_list``,
    ``date`` and ``tag_list``. It is written to ``<vault>/<id>.md``; an
    existing file of that name is overwritten.

    Args:
        paper: Result whose title, entry_id, authors and published date are used.
        additional_tags: Extra tags appended after the three default tags.
        print_on: When true, also print the rendered note.
    """
    # Take what follows "/abs/" rather than what follows the last "/":
    # old-style ids such as "hep-th/9901001v1" contain a slash, and keeping
    # only the tail would produce a wrong abs URL. Fall back to the last path
    # segment when the marker is absent.
    _, marker, tail = paper.entry_id.partition("/abs/")
    bare_id = tail if marker else paper.entry_id.split("/")[-1]
    url = f"https://arxiv.org/abs/{bare_id}"

    tags = ["clippings", "research", "clipping", *(additional_tags or ())]
    author_list = "\n".join(f' - "[[{author.name}]]"' for author in paper.authors)
    tag_list = "\n".join(f"- {tag}" for tag in tags)
    date = paper.published.strftime('%Y-%m-%d')

    vault = _vault_dir()
    vault.mkdir(parents=True, exist_ok=True)
    # A slash in an old-style id must not turn into a sub-directory.
    filename = vault / f"{bare_id.replace('/', '_')}.md"

    template_path = resources_dir() / "formats" / "table_format.md"
    with open(template_path, "r", encoding="utf-8") as f:
        template = f.read()
    final_content = template.format(
        title=paper.title,
        url=url,
        author_list=author_list,
        date=date,
        tag_list=tag_list,
    )

    if print_on:
        print(final_content)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(final_content)
def
fetch_paper_metadata(source_id: str, print_on: bool = False) -> arxiv.Result:
def fetch_paper_metadata(source_id: str, print_on: bool = False) -> arxiv.Result:
    """Fetch the arXiv record for a single id.

    Args:
        source_id: Identifier passed to ``arxiv.Search(id_list=[...])``.
        print_on: When true, print title, date, authors, category, DOI,
            PDF URL and abstract of the fetched paper.

    Returns:
        The first result produced by the shared client for this id.

    Raises:
        StopIteration: Propagated from ``next()`` when the client yields no
            result.
    """
    query = arxiv.Search(id_list=[source_id])

    def _first_result() -> arxiv.Result:
        return next(_client.results(query))

    # The request goes through _arxiv_call like every other arXiv access here.
    paper = _arxiv_call(_first_result)
    if not print_on:
        return paper

    print(f"Title: {paper.title}")
    print(f"Date: {paper.published.strftime('%Y-%m-%d')}")
    print(f"Authors: {', '.join(author.name for author in paper.authors)}")
    print(f"Category: {paper.primary_category}")
    print(f"DOI: {paper.doi}")
    print(f"PDF URL: {paper.pdf_url}")
    print("-" * 30)
    print(f"Abstract:\n{paper.summary}")
    return paper
def
search_papers( query: str, max_results: int = 10, sort_by: arxiv.SortCriterion = <SortCriterion.Relevance: 'relevance'>, sort_order: arxiv.SortOrder = <SortOrder.Descending: 'descending'>, print_on: bool = False) -> list[arxiv.Result]:
def search_papers(
    query: str,
    max_results: int = 10,
    sort_by: arxiv.SortCriterion = arxiv.SortCriterion.Relevance,
    sort_order: arxiv.SortOrder = arxiv.SortOrder.Descending,
    print_on: bool = False,
) -> list[arxiv.Result]:
    """Run an arXiv query and return all results as a list.

    Args:
        query: Query string handed to ``arxiv.Search``.
        max_results: Upper bound on results, forwarded to ``arxiv.Search``.
        sort_by: Sort criterion, forwarded to ``arxiv.Search``.
        sort_order: Sort direction, forwarded to ``arxiv.Search``.
        print_on: When true, print a short summary block for every result.

    Returns:
        The results of the query, fully materialized.
    """
    request = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=sort_by,
        sort_order=sort_order,
    )

    def _collect() -> list[arxiv.Result]:
        # Materialize inside the guarded call so errors raised while
        # iterating the results pass through _arxiv_call.
        return list(_client.results(request))

    found = _arxiv_call(_collect)
    if not print_on:
        return found

    for entry in found:
        print(f"Title: {entry.title}")
        print(f"Date: {entry.published.strftime('%Y-%m-%d')}")
        print(f"Authors: {', '.join(author.name for author in entry.authors)}")
        print(f"Category: {entry.primary_category}")
        print("-" * 30)
    return found
def
gen_md_files( papers: list[arxiv.Result], additional_tags: None | Sequence[str] = None) -> None:
def
gen_md_file( paper: arxiv.Result, additional_tags: None | Sequence[str] = None, print_on: bool = False):
def gen_md_file(paper: arxiv.Result, additional_tags: None | Sequence[str] = None, print_on: bool = False):
    """Render one paper into a Markdown note inside the vault directory.

    The note is produced by filling ``formats/table_format.md`` (under
    ``resources_dir()``) with the fields ``title``, ``url``, ``author_list``,
    ``date`` and ``tag_list``. It is written to ``<vault>/<id>.md``; an
    existing file of that name is overwritten.

    Args:
        paper: Result whose title, entry_id, authors and published date are used.
        additional_tags: Extra tags appended after the three default tags.
        print_on: When true, also print the rendered note.
    """
    # Take what follows "/abs/" rather than what follows the last "/":
    # old-style ids such as "hep-th/9901001v1" contain a slash, and keeping
    # only the tail would produce a wrong abs URL. Fall back to the last path
    # segment when the marker is absent.
    _, marker, tail = paper.entry_id.partition("/abs/")
    bare_id = tail if marker else paper.entry_id.split("/")[-1]
    url = f"https://arxiv.org/abs/{bare_id}"

    tags = ["clippings", "research", "clipping", *(additional_tags or ())]
    author_list = "\n".join(f' - "[[{author.name}]]"' for author in paper.authors)
    tag_list = "\n".join(f"- {tag}" for tag in tags)
    date = paper.published.strftime('%Y-%m-%d')

    vault = _vault_dir()
    vault.mkdir(parents=True, exist_ok=True)
    # A slash in an old-style id must not turn into a sub-directory.
    filename = vault / f"{bare_id.replace('/', '_')}.md"

    template_path = resources_dir() / "formats" / "table_format.md"
    with open(template_path, "r", encoding="utf-8") as f:
        template = f.read()
    final_content = template.format(
        title=paper.title,
        url=url,
        author_list=author_list,
        date=date,
        tag_list=tag_list,
    )

    if print_on:
        print(final_content)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(final_content)