sources
Paper metadata source abstraction layer.
Add new sources by implementing PaperSource and importing them here.
1"""Paper metadata source abstraction layer. 2 3Add new sources by implementing PaperSource and importing them here. 4""" 5 6from .base import PaperMetadata, PaperSource 7from .arxiv_source import ArxivSource 8from .openalex_source import OpenAlexSource 9from .crossref_source import CrossRefSource, fetch_by_doi, search_by_title 10from .fetch_paper_metadata import fetch_paper_metadata, search_papers, gen_md_file, gen_md_files 11from .doi_resolve import resolve_doi, _resolve_doi 12from .pdf_metadata import extract_pdf_metadata, resolve_pdf_metadata 13 14__all__ = [ 15 "PaperMetadata", "PaperSource", "ArxivSource", "OpenAlexSource", 16 "CrossRefSource", "fetch_by_doi", "search_by_title", 17 "fetch_paper_metadata", "search_papers", "gen_md_file", "gen_md_files", 18 "resolve_doi", "_resolve_doi", 19 "extract_pdf_metadata", "resolve_pdf_metadata", 20]
class PaperMetadata(BaseModel):
    """Normalized paper representation (source-agnostic).

    This is the record type returned by every ``PaperSource`` method, so
    callers never handle backend-specific result objects.
    """
    # Namespaced paper ID, e.g. "arxiv:2204.12985", "openalex:W3123456789",
    # "doi:10.48550/...", "local:{hash}".
    source_id: str
    # Required field: no default is declared here. Per the original note,
    # non-arXiv sources are expected to pass 1.
    version: int
    title: str
    authors: list[str]
    published: datetime.date
    updated: datetime.date | None = None
    summary: str
    category: str | None = None
    categories: list[str] | None = None
    doi: str | None = None
    journal_ref: str | None = None
    comment: str | None = None
    url: str | None = None
    tags: list[str] | None = None
    # Identifies which backend produced this record (e.g. 'arxiv', 'openalex').
    # Must equal the source_name of the PaperSource that fetched it.
    source: str | None = None
Normalized paper representation (source-agnostic).
class PaperSource(Protocol):
    """Unified interface for paper metadata providers.

    A backend exposes a short ``source_name`` plus two operations: a free-text
    ``search`` and a lookup by ID, both returning ``PaperMetadata``.
    """

    @property
    def source_name(self) -> str:
        """Short identifier for this backend (e.g. 'arxiv', 'openalex').

        Written into PaperMetadata.source on every record this backend produces,
        so papers can be traced back to the source that fetched them.
        """
        ...

    def search(
        self,
        query: str,
        max_results: int = 10,
        sort: str = "relevance",
    ) -> list[PaperMetadata]:
        """Search for papers matching a query string.

        ``max_results`` is the maximum number of records requested from the
        backend.

        ``sort`` is a source-specific key. Each source defines the values it
        accepts and raises ``ValueError`` on unrecognized keys. Pass
        ``"relevance"`` (the default) to get each source's default ordering.
        Sources whose backend does not support sorting accept the parameter but
        document this explicitly and ignore it.
        """
        ...

    def fetch_by_id(self, source_id: str) -> PaperMetadata:
        """Fetch metadata for a specific paper by its source-specific ID.

        The implementations in this package strip their own namespace prefix
        ("arxiv:", "openalex:", "doi:") when it is present, so both the
        namespaced and the bare form of an ID are accepted.
        """
        ...
Unified interface for paper metadata providers.
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder ``__init__`` installed on protocol classes and subclasses.

    Raises ``TypeError`` when the class being instantiated is itself a
    protocol. Otherwise, on first use, it finds the first real ``__init__``
    in the MRO, installs it on the class, and forwards the call to it.
    """
    cls = type(self)

    if cls._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # Already using a custom `__init__`. No need to calculate correct
    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
    if cls.__init__ is not _no_init_or_replace_init:
        return

    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
    # searches for a proper new `__init__` in the MRO. The new `__init__`
    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
    # instantiation of the protocol subclass will thus use the new
    # `__init__` and no longer call `_no_init_or_replace_init`.
    for base in cls.__mro__:
        init = base.__dict__.get('__init__', _no_init_or_replace_init)
        if init is not _no_init_or_replace_init:
            cls.__init__ = init
            break
    else:
        # should not happen
        cls.__init__ = object.__init__

    # Run the initializer that was just installed with the original arguments.
    cls.__init__(self, *args, **kwargs)
@property
def source_name(self) -> str:
    """Short identifier for this backend (e.g. 'arxiv', 'openalex').

    Written into PaperMetadata.source on every record this backend produces,
    so papers can be traced back to the source that fetched them.

    This is a protocol stub: the body is intentionally empty and each
    concrete source supplies the actual value.
    """
    ...
Short identifier for this backend (e.g. 'arxiv', 'openalex').
Written into PaperMetadata.source on every record this backend produces, so papers can be traced back to the source that fetched them.
def search(
    self,
    query: str,
    max_results: int = 10,
    sort: str = "relevance",
) -> list[PaperMetadata]:
    """Search for papers matching a query string.

    ``max_results`` is the maximum number of records requested from the
    backend.

    ``sort`` is a source-specific key. Each source defines the values it
    accepts and raises ``ValueError`` on unrecognized keys. Pass
    ``"relevance"`` (the default) to get each source's default ordering.
    Sources whose backend does not support sorting accept the parameter but
    document this explicitly and ignore it.

    This is a protocol stub: the body is intentionally empty.
    """
    ...
Search for papers matching a query string.
sort is a source-specific key. Each source defines the values it
accepts and raises ValueError on unrecognized keys. Pass
"relevance" (the default) to get each source's default ordering.
Sources whose backend does not support sorting accept the parameter but
document this explicitly and ignore it.
class ArxivSource(PaperSource):
    """Paper source backed by the arXiv API.

    All requests go through one ``arxiv.Client`` created at construction
    time. Failures other than "not found" are re-raised as ``ValueError``.
    """

    @property
    def source_name(self) -> str:
        """Return the backend identifier, ``"arxiv"``."""
        return "arxiv"

    def __init__(self, *, num_retries: int = 1, delay_seconds: float = 7.0) -> None:
        """Create the underlying arXiv client.

        Args:
            num_retries: Retry count handed to ``arxiv.Client``.
            delay_seconds: Delay between requests handed to ``arxiv.Client``.

        The defaults are the values that used to be hard-coded, so
        ``ArxivSource()`` behaves exactly as before.
        """
        self._client = arxiv.Client(
            num_retries=num_retries, delay_seconds=delay_seconds
        )

    def search(
        self,
        query: str,
        max_results: int = 10,
        sort: str = "relevance",
    ) -> list[PaperMetadata]:
        """Search arXiv and return the matches as ``PaperMetadata``.

        Raises:
            ValueError: If ``sort`` is not a key of ``_SORT_MAP``, or if the
                arXiv request fails for any reason.
        """
        if sort not in _SORT_MAP:
            raise ValueError(f"unknown sort {sort!r}; valid: {sorted(_SORT_MAP)}")
        sort_by, sort_order = _SORT_MAP[sort]
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=sort_by,
            sort_order=sort_order,
        )
        _check_ratelimit()
        try:
            results = list(self._client.results(search))
        except Exception as e:
            print(f"[arxiv] search error: {e}")
            # A 429 in the error text is treated as a rate-limit hit.
            if "429" in str(e):
                _record_ratelimit()
            raise ValueError(f"arXiv search failed: {e}") from e
        return [_result_to_metadata(r) for r in results]

    def fetch_by_id(self, source_id: str) -> PaperMetadata:
        """Fetch one paper by arXiv ID, with or without the ``arxiv:`` prefix.

        Raises:
            ValueError: If the ID is empty after removing the prefix, or if
                the request fails for a reason other than "not found".
            ArxivNotFoundError: If arXiv returns no result for the ID.
        """
        bare_id = source_id.removeprefix("arxiv:")
        if not bare_id:
            raise ValueError(f"source_id '{source_id}' resolves to an empty arXiv ID.")
        search = arxiv.Search(id_list=[bare_id])
        _check_ratelimit()
        try:
            result = next(self._client.results(search))
        except StopIteration:
            raise ArxivNotFoundError(f"Paper '{source_id}' not found on arXiv.") from None
        except Exception as e:
            # The client's "empty page" error is reported as not-found too.
            if _ARXIV_EMPTY_PAGE_ERROR and isinstance(e, _ARXIV_EMPTY_PAGE_ERROR):
                raise ArxivNotFoundError(f"Paper '{source_id}' not found on arXiv.") from None
            print(f"[arxiv] fetch error: {e}")
            if "429" in str(e):
                _record_ratelimit()
            raise ValueError(f"arXiv fetch failed for '{source_id}': {e}") from e
        return _result_to_metadata(result)
Paper source backed by the arXiv API.
Short identifier for this backend (e.g. 'arxiv', 'openalex').
Written into PaperMetadata.source on every record this backend produces, so papers can be traced back to the source that fetched them.
def search(
    self,
    query: str,
    max_results: int = 10,
    sort: str = "relevance",
) -> list[PaperMetadata]:
    """Search arXiv and return the matches as ``PaperMetadata``.

    Raises ``ValueError`` when ``sort`` is not a key of ``_SORT_MAP`` or when
    the arXiv request fails for any reason.
    """
    if sort not in _SORT_MAP:
        raise ValueError(f"unknown sort {sort!r}; valid: {sorted(_SORT_MAP)}")

    criterion, direction = _SORT_MAP[sort]
    request = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=criterion,
        sort_order=direction,
    )

    _check_ratelimit()
    try:
        hits = list(self._client.results(request))
    except Exception as exc:
        print(f"[arxiv] search error: {exc}")
        # A 429 in the error text is treated as a rate-limit hit.
        if "429" in str(exc):
            _record_ratelimit()
        raise ValueError(f"arXiv search failed: {exc}") from exc

    return list(map(_result_to_metadata, hits))
Search for papers matching a query string.
sort is a source-specific key. Each source defines the values it
accepts and raises ValueError on unrecognized keys. Pass
"relevance" (the default) to get each source's default ordering.
Sources whose backend does not support sorting accept the parameter but
document this explicitly and ignore it.
def fetch_by_id(self, source_id: str) -> PaperMetadata:
    """Fetch one paper by arXiv ID, with or without the ``arxiv:`` prefix.

    Raises ``ValueError`` for an empty ID or a failed request, and
    ``ArxivNotFoundError`` when arXiv returns no result for the ID.
    """
    arxiv_id = source_id.removeprefix("arxiv:")
    if not arxiv_id:
        raise ValueError(f"source_id '{source_id}' resolves to an empty arXiv ID.")

    request = arxiv.Search(id_list=[arxiv_id])
    _check_ratelimit()
    try:
        record = next(self._client.results(request))
    except StopIteration:
        raise ArxivNotFoundError(f"Paper '{source_id}' not found on arXiv.") from None
    except Exception as exc:
        # The client's "empty page" error is reported as not-found too.
        if _ARXIV_EMPTY_PAGE_ERROR and isinstance(exc, _ARXIV_EMPTY_PAGE_ERROR):
            raise ArxivNotFoundError(f"Paper '{source_id}' not found on arXiv.") from None
        print(f"[arxiv] fetch error: {exc}")
        if "429" in str(exc):
            _record_ratelimit()
        raise ValueError(f"arXiv fetch failed for '{source_id}': {exc}") from exc

    return _result_to_metadata(record)
Fetch metadata for a specific paper by its source-specific ID.
class OpenAlexSource(PaperSource):
    """Paper source backed by the OpenAlex REST API.

    Holds one ``httpx.Client`` for its lifetime. Call ``close()`` or use the
    instance as a context manager to release the client's connections.
    """

    @property
    def source_name(self) -> str:
        """Return the backend identifier, ``"openalex"``."""
        return "openalex"

    def __init__(self) -> None:
        self._http = httpx.Client(
            base_url=_BASE_URL,
            timeout=30.0,
        )

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._http.close()

    def __enter__(self) -> "OpenAlexSource":
        return self

    def __exit__(self, *exc_info: object) -> None:
        self.close()

    def search(
        self,
        query: str,
        max_results: int = 10,
        sort: str = "relevance",
    ) -> list[PaperMetadata]:
        """Search OpenAlex works and return the matches as ``PaperMetadata``.

        Returns an empty list, without calling the API, when the sanitized
        query is empty. Records that fail conversion are skipped.

        Raises:
            ValueError: If ``sort`` is not a key of ``_SORT_PARAM``, or on any
                failure other than an HTTP error status.
            OpenAlexHTTPError: If OpenAlex answers with an error status.
        """
        if sort not in _SORT_PARAM:
            raise ValueError(f"unknown sort {sort!r}; valid: {sorted(_SORT_PARAM)}")
        sanitized = _sanitize_search_query(query)
        # An empty search returns OpenAlex's unfiltered work list; skip the call.
        if not sanitized:
            return []
        params: dict[str, str | int] = {
            "search": sanitized,
            "per_page": max_results,
            "select": _OPENALEX_WORK_FIELDS,
            "sort": _SORT_PARAM[sort],
        }
        try:
            response = self._http.get(
                "/works", params=params, headers={"User-Agent": _user_agent()}
            )
            response.raise_for_status()
            raw_results = response.json().get("results", [])
        except httpx.HTTPStatusError as e:
            status = e.response.status_code
            raise OpenAlexHTTPError(
                f"OpenAlex search failed: HTTP {status}", status
            ) from e
        except Exception as e:
            raise ValueError(f"OpenAlex search failed: {e}") from e
        results = []
        for work in raw_results:
            try:
                results.append(_work_to_metadata(work))
            except Exception as e:
                # One malformed record must not discard the rest of the page.
                print(f"[openalex] skipping malformed work record: {e}")
        return results

    def fetch_by_id(self, source_id: str) -> PaperMetadata:
        """Fetch one work by OpenAlex ID.

        Accepts a bare work ID, an ``openalex:``-prefixed ID, or an http(s)
        URL whose last path segment is the work ID.

        Raises:
            OpenAlexInputError: If the ID is empty or fails ``_WORK_ID_RE``.
            OpenAlexNotFoundError: If OpenAlex answers 404.
            OpenAlexHTTPError: For any other HTTP error status.
            ValueError: For any other failure.
        """
        bare_id = source_id.removeprefix("openalex:")
        # Normalise any URL form (API or landing page) to a bare work ID.
        # Trailing slashes are dropped first so ".../W123/" still yields "W123".
        if bare_id.startswith(("http://", "https://")):
            bare_id = bare_id.rstrip("/").rsplit("/", 1)[-1]
        if not bare_id:
            raise OpenAlexInputError(
                f"source_id '{source_id}' resolves to an empty work ID."
            )
        if not _WORK_ID_RE.fullmatch(bare_id):
            raise OpenAlexInputError(
                f"Invalid OpenAlex work ID '{bare_id}': expected 'W' followed by digits."
            )
        try:
            response = self._http.get(
                f"/works/{bare_id}",
                params={"select": _OPENALEX_WORK_FIELDS},
                headers={"User-Agent": _user_agent()},
            )
            response.raise_for_status()
            return _work_to_metadata(response.json())
        except httpx.HTTPStatusError as e:
            status = e.response.status_code
            if status == 404:
                raise OpenAlexNotFoundError(
                    f"Paper '{source_id}' not found on OpenAlex."
                ) from e
            raise OpenAlexHTTPError(
                f"OpenAlex returned HTTP {status} for '{source_id}'.", status
            ) from e
        except Exception as e:
            raise ValueError(f"OpenAlex fetch failed for '{source_id}': {e}") from e
Paper source backed by the OpenAlex REST API.
Short identifier for this backend (e.g. 'arxiv', 'openalex').
Written into PaperMetadata.source on every record this backend produces, so papers can be traced back to the source that fetched them.
def search(
    self,
    query: str,
    max_results: int = 10,
    sort: str = "relevance",
) -> list[PaperMetadata]:
    """Search OpenAlex works and return the matches as ``PaperMetadata``.

    Returns an empty list, without calling the API, when the sanitized query
    is empty. Records that fail conversion are skipped. Raises ``ValueError``
    for an unknown ``sort`` or a non-HTTP failure, and ``OpenAlexHTTPError``
    for an HTTP error status.
    """
    if sort not in _SORT_PARAM:
        raise ValueError(f"unknown sort {sort!r}; valid: {sorted(_SORT_PARAM)}")

    cleaned = _sanitize_search_query(query)
    if not cleaned:
        # An empty search returns OpenAlex's unfiltered work list; skip the call.
        return []

    request_params: dict[str, str | int] = {
        "search": cleaned,
        "per_page": max_results,
        "select": _OPENALEX_WORK_FIELDS,
        "sort": _SORT_PARAM[sort],
    }
    try:
        reply = self._http.get(
            "/works", params=request_params, headers={"User-Agent": _user_agent()}
        )
        reply.raise_for_status()
        works = reply.json().get("results", [])
    except httpx.HTTPStatusError as exc:
        code = exc.response.status_code
        raise OpenAlexHTTPError(f"OpenAlex search failed: HTTP {code}", code) from exc
    except Exception as exc:
        raise ValueError(f"OpenAlex search failed: {exc}") from exc

    converted = []
    for work in works:
        try:
            record = _work_to_metadata(work)
        except Exception as exc:
            # One malformed record must not discard the rest of the page.
            print(f"[openalex] skipping malformed work record: {exc}")
            continue
        converted.append(record)
    return converted
Search for papers matching a query string.
sort is a source-specific key. Each source defines the values it
accepts and raises ValueError on unrecognized keys. Pass
"relevance" (the default) to get each source's default ordering.
Sources whose backend does not support sorting accept the parameter but
document this explicitly and ignore it.
def fetch_by_id(self, source_id: str) -> PaperMetadata:
    """Fetch one work by OpenAlex ID.

    Accepts a bare work ID, an ``openalex:``-prefixed ID, or an http(s) URL
    whose last path segment is the work ID.

    Raises:
        OpenAlexInputError: If the ID is empty or fails ``_WORK_ID_RE``.
        OpenAlexNotFoundError: If OpenAlex answers 404.
        OpenAlexHTTPError: For any other HTTP error status.
        ValueError: For any other failure.
    """
    bare_id = source_id.removeprefix("openalex:")
    # Normalise any URL form (API or landing page) to a bare work ID.
    # Trailing slashes are dropped first so ".../W123/" still yields "W123".
    if bare_id.startswith(("http://", "https://")):
        bare_id = bare_id.rstrip("/").rsplit("/", 1)[-1]
    if not bare_id:
        raise OpenAlexInputError(
            f"source_id '{source_id}' resolves to an empty work ID."
        )
    if not _WORK_ID_RE.fullmatch(bare_id):
        raise OpenAlexInputError(
            f"Invalid OpenAlex work ID '{bare_id}': expected 'W' followed by digits."
        )
    try:
        response = self._http.get(
            f"/works/{bare_id}",
            params={"select": _OPENALEX_WORK_FIELDS},
            headers={"User-Agent": _user_agent()},
        )
        response.raise_for_status()
        return _work_to_metadata(response.json())
    except httpx.HTTPStatusError as e:
        status = e.response.status_code
        if status == 404:
            raise OpenAlexNotFoundError(
                f"Paper '{source_id}' not found on OpenAlex."
            ) from e
        raise OpenAlexHTTPError(
            f"OpenAlex returned HTTP {status} for '{source_id}'.", status
        ) from e
    except Exception as e:
        raise ValueError(f"OpenAlex fetch failed for '{source_id}': {e}") from e
Fetch metadata for a specific paper by its source-specific ID.
class CrossRefSource(PaperSource):
    """Paper source backed by the CrossRef REST API.

    A thin adapter over the module-level ``search_by_title`` and
    ``fetch_by_doi`` helpers.
    """

    @property
    def source_name(self) -> str:
        """Return the backend identifier, ``"crossref"``."""
        return "crossref"

    def search(
        self,
        query: str,
        max_results: int = 10,
        sort: str = "relevance",
    ) -> list[PaperMetadata]:
        """Search CrossRef by title.

        ``sort`` is accepted to satisfy the PaperSource protocol and is
        ignored; CrossRef title search returns results in relevance order
        regardless.
        """
        del sort
        matches = search_by_title(query, limit=max_results)
        return matches

    def fetch_by_id(self, source_id: str) -> PaperMetadata:
        """Fetch one record by DOI, with or without the ``doi:`` prefix.

        Raises ``ValueError`` when ``fetch_by_doi`` returns no record.
        """
        doi = source_id.removeprefix("doi:")
        record = fetch_by_doi(doi)
        if record is not None:
            return record
        raise ValueError(f"CrossRef: no record found for DOI '{source_id}'")
Paper source backed by the CrossRef REST API.
Short identifier for this backend (e.g. 'arxiv', 'openalex').
Written into PaperMetadata.source on every record this backend produces, so papers can be traced back to the source that fetched them.
def search(
    self,
    query: str,
    max_results: int = 10,
    sort: str = "relevance",
) -> list[PaperMetadata]:
    """Search CrossRef by title.

    ``sort`` is accepted to satisfy the PaperSource protocol and is ignored;
    CrossRef title search returns results in relevance order regardless.
    """
    del sort
    matches = search_by_title(query, limit=max_results)
    return matches
Search for papers matching a query string.
sort is a source-specific key. Each source defines the values it
accepts and raises ValueError on unrecognized keys. Pass
"relevance" (the default) to get each source's default ordering.
Sources whose backend does not support sorting accept the parameter but
document this explicitly and ignore it.
def fetch_by_doi(doi: str) -> PaperMetadata | None:
    """Fetch CrossRef metadata for a DOI. Returns None on any error.

    ``None`` is also returned for a non-200 response and for a record that
    has no title.

    The DOI is percent-encoded before it goes into the URL path (slashes are
    kept). Without this, a DOI containing ``?`` or ``#`` is parsed as a query
    string or fragment and the wrong resource is requested.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import quote

    try:
        # Encoding happens inside the try so a bad ``doi`` value still
        # results in None rather than an exception.
        encoded_doi = quote(doi, safe="/")
        with httpx.Client(headers={"User-Agent": _mailto_header()}, timeout=10.0) as client:
            resp = client.get(f"{CROSSREF_BASE}/{encoded_doi}")
            if resp.status_code != 200:
                return None
            msg = resp.json().get("message", {})
            if not msg.get("title"):
                return None
            # The parser receives the DOI exactly as the caller supplied it.
            return _parse_crossref_work(msg, doi=doi)
    except Exception:
        # Deliberate best-effort: callers treat None as "no record".
        return None
Fetch CrossRef metadata for a DOI. Returns None on any error.
def search_by_title(title: str, limit: int = 5) -> list[PaperMetadata]:
    """Search CrossRef by title. Returns empty list on any error.

    At most ``limit`` rows are requested. Items without a title or a DOI are
    left out of the result.
    """
    try:
        with httpx.Client(headers={"User-Agent": _mailto_header()}, timeout=10.0) as client:
            reply = client.get(
                CROSSREF_BASE,
                params={"query.title": title, "rows": limit},
            )
            if reply.status_code != 200:
                return []
            candidates = reply.json().get("message", {}).get("items", [])
            return [
                _parse_crossref_work(entry, doi=found_doi)
                for entry in candidates
                if entry.get("title") and (found_doi := entry.get("DOI", ""))
            ]
    except Exception:
        # Deliberate best-effort: any failure yields an empty result.
        return []
Search CrossRef by title. Returns empty list on any error.
def fetch_paper_metadata(source_id: str, print_on: bool = False) -> arxiv.Result:
    """Fetch a single arXiv paper and optionally print a summary of it.

    Args:
        source_id: arXiv ID, bare ("2204.12985") or namespaced
            ("arxiv:2204.12985"). The namespace prefix is removed before the
            lookup, because arXiv only understands the bare form.
        print_on: When true, print title, date, authors, category, DOI,
            PDF URL and abstract to stdout.

    Returns:
        The raw ``arxiv.Result`` for the paper.
    """
    bare_id = source_id.removeprefix("arxiv:")
    search = arxiv.Search(id_list=[bare_id])
    paper = _arxiv_call(lambda: next(_client.results(search)))

    if print_on:
        print(f"Title: {paper.title}")
        print(f"Date: {paper.published.strftime('%Y-%m-%d')}")
        print(f"Authors: {', '.join(author.name for author in paper.authors)}")
        print(f"Category: {paper.primary_category}")
        print(f"DOI: {paper.doi}")
        print(f"PDF URL: {paper.pdf_url}")
        print("-" * 30)
        print(f"Abstract:\n{paper.summary}")

    return paper
def search_papers(
    query: str,
    max_results: int = 10,
    sort_by: arxiv.SortCriterion = arxiv.SortCriterion.Relevance,
    sort_order: arxiv.SortOrder = arxiv.SortOrder.Descending,
    print_on: bool = False,
) -> list[arxiv.Result]:
    """Search arXiv and return the raw ``arxiv.Result`` objects.

    When ``print_on`` is true, a short summary of every hit (title, date,
    authors, category) is printed to stdout.
    """
    request = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=sort_by,
        sort_order=sort_order,
    )
    hits = _arxiv_call(lambda: list(_client.results(request)))

    if not print_on:
        return hits

    for hit in hits:
        author_names = ', '.join(author.name for author in hit.authors)
        print(f"Title: {hit.title}")
        print(f"Date: {hit.published.strftime('%Y-%m-%d')}")
        print(f"Authors: {author_names}")
        print(f"Category: {hit.primary_category}")
        print("-" * 30)

    return hits
def gen_md_file(
    paper: arxiv.Result,
    additional_tags: None | Sequence[str] = None,
    print_on: bool = False,
) -> None:
    """Write a Markdown note for ``paper`` into the vault directory.

    The note is rendered from ``formats/table_format.md`` under the resources
    directory and saved as ``<arxiv id>.md`` in the vault, which is created
    if it does not exist.

    Args:
        paper: arXiv result to render.
        additional_tags: Extra tags appended after the built-in ones.
        print_on: When true, also print the rendered note to stdout.
    """
    title: str = paper.title
    # The last path segment of the entry URL is used as the paper ID.
    bare_id: str = paper.entry_id.split('/')[-1]
    url: str = f"https://arxiv.org/abs/{bare_id}"
    authors: list[str] = [author.name for author in paper.authors]
    tags: list[str] = ["clippings", "research", "clipping"]
    if additional_tags:
        tags.extend(additional_tags)

    date = paper.published.strftime('%Y-%m-%d')
    vault = _vault_dir()
    vault.mkdir(parents=True, exist_ok=True)
    filename = vault / f"{bare_id}.md"

    author_list = "\n".join(f' - "[[{name}]]"' for name in authors)
    tag_list = "\n".join(f'- {tag}' for tag in tags)
    with open(resources_dir() / "formats" / "table_format.md", "r", encoding="utf-8") as f:
        template = f.read()
    final_content = template.format(
        title=title,
        url=url,
        author_list=author_list,
        date=date,
        tag_list=tag_list,
    )
    if print_on:
        print(final_content)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(final_content)
def resolve_doi(doi: str) -> PaperMetadata:
    """
    Resolve a DOI to PaperMetadata, trying three strategies in order:
      1. arXiv-issued DOI → fetch directly from arXiv
      2. Semantic Scholar → resolves any DOI; uses arXiv ID when available
      3. CrossRef         → last resort; broadest DOI coverage
    The first strategy that yields a record wins.
    Raises ValueError with a human-readable message on failure.
    """
    doi = _strip_doi_url(doi)
    if not doi:
        raise ValueError("Please enter a DOI.")

    # Ordered from most specific to broadest coverage.
    for strategy in (_try_arxiv_doi, _try_semantic_scholar, _try_crossref):
        found = strategy(doi)
        if found:
            return found

    raise ValueError(
        "Could not resolve this DOI.\n"
        "• Check the DOI is correct\n"
        "• The paper may not be indexed by Semantic Scholar or CrossRef\n"
        "• arXiv-hosted papers use DOIs starting with 10.48550/arXiv."
    )
Resolve a DOI to PaperMetadata via three strategies:
- arXiv-issued DOI → fetch directly from arXiv
- Semantic Scholar → resolves any DOI; uses arXiv ID when available
- CrossRef → last resort; broadest DOI coverage
Raises ValueError with a human-readable message on failure.
def _resolve_doi(doi: str) -> PaperMetadata:
    """Alias of ``resolve_doi`` kept for the PyQt DOI page, which uses this name."""
    resolved = resolve_doi(doi)
    return resolved
Backward-compatible name used by the PyQt DOI page.
def extract_pdf_metadata(pdf_path: str) -> dict:
    """Extract raw metadata from a PDF's embedded fields and first-page text.

    Returns a dict with keys: title, authors, doi, arxiv_id, year, abstract.
    Values are None when not found. `abstract` is always None at this stage —
    the PDF /Subject field is not a reliable abstract source.
    """
    pdf = PdfReader(pdf_path)
    info = pdf.metadata or {}

    # Embedded title/author fields are discarded when the helpers judge them
    # auto-generated or junk.
    title: Optional[str] = info.get("/Title") or None
    if title and _looks_autogenerated(title):
        title = None

    authors: Optional[str] = info.get("/Author") or None
    if authors and _looks_junk_author(authors):
        authors = None

    # Only the first page is scanned for identifiers.
    page_text = ""
    doi: Optional[str] = None
    arxiv_id: Optional[str] = None
    if pdf.pages:
        page_text = pdf.pages[0].extract_text() or ""
        doi = _extract_doi(page_text)
        arxiv_id = _extract_arxiv_id(page_text)

    # Fall back to a text heuristic when no usable embedded title exists.
    if page_text and not title:
        title = _extract_title_from_text(page_text)

    # The year is the first 4-digit run in the creation (else modification)
    # date, accepted only within 1900..next year.
    year: Optional[int] = None
    stamp = info.get("/CreationDate") or info.get("/ModDate")
    found = re.search(r"(\d{4})", str(stamp)) if stamp else None
    if found:
        value = int(found.group(1))
        if 1900 <= value <= datetime.date.today().year + 1:
            year = value

    result = {
        "title": title,
        "authors": authors,
        "doi": doi,
        "arxiv_id": arxiv_id,
        "year": year,
        "abstract": None,
    }
    return result
Extract raw metadata from a PDF's embedded fields and first-page text.
Returns a dict with keys: title, authors, doi, arxiv_id, year, abstract.
Values are None when not found. abstract is always None at this stage —
the PDF /Subject field is not a reliable abstract source.
def resolve_pdf_metadata(
    pdf_path: str,
) -> tuple[PaperMetadata, tuple[str, int] | None]:
    """Run the full metadata pipeline for a PDF file.

    Returns ``(meta, external_identity)`` where:
    - ``meta`` is a PaperMetadata with the local source_id produced by
      ``_pdf_source_id`` and ``source="pdf"``, so the imported paper keeps a
      local identity regardless of where its metadata came from.
    - ``external_identity`` is ``(source_id, version)`` of the upstream
      record that was matched (e.g. ``("arxiv:2204.12985", 4)`` or
      ``("doi:10.1234/xyz", 1)``) when enrichment succeeded, else ``None``.
      Callers that have DB access can use this to dedupe against existing
      paper roots — see ``service/paper.py:import_pdf``.

    Resolution order for metadata enrichment:
    1. arXiv ID found in first-page text  -> ArxivSource.fetch_by_id()
    2. DOI found in first-page text       -> resolve_doi()
    3. Title (embedded or text heuristic) -> CrossRef title search
    4. Partial record from whatever was extracted (caller prompts for
       missing fields)

    Raises FileNotFoundError if the path does not exist, or pypdf errors on
    corrupt PDFs.
    """
    extracted = extract_pdf_metadata(pdf_path)
    local_id = _pdf_source_id(pdf_path)

    upstream: PaperMetadata | None = None

    # Each enrichment step is best-effort; a failure falls through to the next.
    if extracted["arxiv_id"]:
        try:
            upstream = ArxivSource().fetch_by_id(extracted["arxiv_id"])
        except Exception:
            pass

    if not upstream and extracted["doi"]:
        try:
            upstream = resolve_doi(extracted["doi"])
        except ValueError:
            pass

    if not upstream and extracted["title"]:
        upstream = _try_crossref_title(extracted["title"])

    if not upstream:
        # Nothing matched: build a partial record from the PDF alone.
        names: list[str] = []
        if extracted["authors"]:
            # Split on semicolons only, so "Doe, John; Smith, Jane" keeps
            # each "Last, First" name intact.
            names = [
                piece.strip()
                for piece in re.split(r";", extracted["authors"])
                if piece.strip()
            ]

        if extracted["year"]:
            published = datetime.date(extracted["year"], 1, 1)
        else:
            published = datetime.date.today()

        partial = PaperMetadata(
            source_id=local_id,
            version=1,
            title=extracted["title"] or "",
            authors=names,
            published=published,
            summary="",
            source="pdf",
        )
        return partial, None

    external: tuple[str, int] | None = None
    if upstream.source_id:
        external = (upstream.source_id, upstream.version)

    # Only this subset of the upstream fields is copied onto the local record.
    merged = PaperMetadata(
        source_id=local_id,
        version=1,
        title=upstream.title or extracted["title"] or "",
        authors=upstream.authors,
        published=upstream.published,
        summary=upstream.summary,
        doi=upstream.doi,
        url=upstream.url,
        category=upstream.category,
        source="pdf",
    )
    return merged, external
Run the full metadata pipeline for a PDF file.
Returns (meta, external_identity) where:
- meta is a PaperMetadata with a local:<sha256> source_id so the imported paper defaults to a local identity regardless of where it was sourced. Metadata fields (title, authors, etc.) are enriched from arXiv/CrossRef when an ID is found in the first-page text.
- external_identity is (source_id, version) of the upstream record we matched (e.g. ("arxiv:2204.12985", 4) or ("doi:10.1234/xyz", 1)) when enrichment succeeded, else None. Callers that have DB access can use this to dedupe against existing paper roots — see service/paper.py:import_pdf.
Resolution order for metadata enrichment:
- arXiv ID found in first-page text -> ArxivSource.fetch_by_id()
- DOI found in first-page text -> resolve_doi()
- Title (embedded or text heuristic) -> CrossRef title search
- Partial record from whatever was extracted (caller prompts for missing fields)
Raises FileNotFoundError if the path does not exist, or pypdf errors on corrupt PDFs.