sources

Paper metadata source abstraction layer.

Add new sources by implementing PaperSource and importing them here.

 1"""Paper metadata source abstraction layer.
 2
 3Add new sources by implementing PaperSource and importing them here.
 4"""
 5
 6from .base import PaperMetadata, PaperSource
 7from .arxiv_source import ArxivSource
 8from .openalex_source import OpenAlexSource
 9from .crossref_source import CrossRefSource, fetch_by_doi, search_by_title
10from .fetch_paper_metadata import fetch_paper_metadata, search_papers, gen_md_file, gen_md_files
11from .doi_resolve import resolve_doi, _resolve_doi
12from .pdf_metadata import extract_pdf_metadata, resolve_pdf_metadata
13
# Names re-exported as the package's public API.
# ``_resolve_doi`` is listed despite its leading underscore: it is a
# backward-compatible alias of ``resolve_doi`` kept for existing callers.
__all__ = [
    "PaperMetadata", "PaperSource", "ArxivSource", "OpenAlexSource",
    "CrossRefSource", "fetch_by_doi", "search_by_title",
    "fetch_paper_metadata", "search_papers", "gen_md_file", "gen_md_files",
    "resolve_doi", "_resolve_doi",
    "extract_pdf_metadata", "resolve_pdf_metadata",
]
class PaperMetadata(pydantic.main.BaseModel):
class PaperMetadata(BaseModel):
    """Normalized paper representation (source-agnostic).

    ``source_id``, ``version``, ``title``, ``authors``, ``published`` and
    ``summary`` declare no default and must be supplied by the caller; every
    other field defaults to ``None``.
    """
    source_id: str       # namespaced paper ID (e.g. "arxiv:2204.12985", "openalex:W3123456789", "doi:10.48550/...", "local:{hash}")
    version: int        # no default is declared on the field; callers pass it explicitly (1 for non-arXiv sources)
    title: str
    authors: list[str]
    published: datetime.date
    updated: datetime.date | None = None
    summary: str
    category: str | None = None
    categories: list[str] | None = None
    doi: str | None = None
    journal_ref: str | None = None
    comment: str | None = None
    url: str | None = None
    tags: list[str] | None = None
    # Identifies which backend produced this record (e.g. 'arxiv', 'openalex').
    # Must equal the source_name of the PaperSource that fetched it.
    source: str | None = None

Normalized paper representation (source-agnostic).

source_id: str = PydanticUndefined
version: int = PydanticUndefined
title: str = PydanticUndefined
authors: list[str] = PydanticUndefined
published: datetime.date = PydanticUndefined
updated: datetime.date | None = None
summary: str = PydanticUndefined
category: str | None = None
categories: list[str] | None = None
doi: str | None = None
journal_ref: str | None = None
comment: str | None = None
url: str | None = None
tags: list[str] | None = None
source: str | None = None
class PaperSource(typing.Protocol):
class PaperSource(Protocol):
    """Unified interface for paper metadata providers.

    Declared as a ``Protocol``: every member body is ``...`` and concrete
    backends supply the behaviour.
    """

    @property
    def source_name(self) -> str:
        """Short identifier for this backend (e.g. 'arxiv', 'openalex').

        Written into PaperMetadata.source on every record this backend produces,
        so papers can be traced back to the source that fetched them.
        """
        ...

    def search(
        self,
        query: str,
        max_results: int = 10,
        sort: str = "relevance",
    ) -> list[PaperMetadata]:
        """Search for papers matching a query string.

        ``sort`` is a source-specific key.  Each source defines the values it
        accepts and raises ``ValueError`` on unrecognized keys.  Pass
        ``"relevance"`` (the default) to get each source's default ordering.
        Sources whose backend does not support sorting accept the parameter but
        document this explicitly and ignore it.

        Args:
            query: Free-text search string.
            max_results: Upper bound on the number of records requested.
            sort: Source-specific ordering key (see above).

        Returns:
            A list of normalized ``PaperMetadata`` records.
        """
        ...

    def fetch_by_id(self, source_id: str) -> PaperMetadata:
        """Fetch metadata for a specific paper by its source-specific ID.

        Args:
            source_id: Identifier of the paper in this backend.

        Returns:
            The normalized ``PaperMetadata`` record for that paper.
        """
        ...

Unified interface for paper metadata providers.

PaperSource(*args, **kwargs)
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder ``__init__`` installed on protocol classes.

    Raises ``TypeError`` when the class itself is a protocol. Otherwise it
    finds the first real ``__init__`` in the MRO, installs it on the class
    (so this placeholder is not hit again), and calls it with the given
    arguments.
    """
    cls = type(self)

    if cls._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # Already using a custom `__init__`. No need to calculate correct
    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
    if cls.__init__ is not _no_init_or_replace_init:
        return

    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
    # searches for a proper new `__init__` in the MRO. The new `__init__`
    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
    # instantiation of the protocol subclass will thus use the new
    # `__init__` and no longer call `_no_init_or_replace_init`.
    for base in cls.__mro__:
        init = base.__dict__.get('__init__', _no_init_or_replace_init)
        if init is not _no_init_or_replace_init:
            cls.__init__ = init
            break
    else:
        # should not happen
        cls.__init__ = object.__init__

    cls.__init__(self, *args, **kwargs)
source_name: str
    @property
    def source_name(self) -> str:
        """Short identifier for this backend (e.g. 'arxiv', 'openalex').

        Written into PaperMetadata.source on every record this backend produces,
        so papers can be traced back to the source that fetched them.

        Returns:
            The backend's identifier string.
        """
        ...

Short identifier for this backend (e.g. 'arxiv', 'openalex').

Written into PaperMetadata.source on every record this backend produces, so papers can be traced back to the source that fetched them.

def search( self, query: str, max_results: int = 10, sort: str = 'relevance') -> list[PaperMetadata]:
    def search(
        self,
        query: str,
        max_results: int = 10,
        sort: str = "relevance",
    ) -> list[PaperMetadata]:
        """Search for papers matching a query string.

        ``sort`` is a source-specific key.  Each source defines the values it
        accepts and raises ``ValueError`` on unrecognized keys.  Pass
        ``"relevance"`` (the default) to get each source's default ordering.
        Sources whose backend does not support sorting accept the parameter but
        document this explicitly and ignore it.

        Args:
            query: Free-text search string.
            max_results: Upper bound on the number of records requested.
            sort: Source-specific ordering key (see above).

        Returns:
            A list of normalized ``PaperMetadata`` records.
        """
        ...

Search for papers matching a query string.

sort is a source-specific key. Each source defines the values it accepts and raises ValueError on unrecognized keys. Pass "relevance" (the default) to get each source's default ordering. Sources whose backend does not support sorting accept the parameter but document this explicitly and ignore it.

def fetch_by_id(self, source_id: str) -> PaperMetadata:
    def fetch_by_id(self, source_id: str) -> PaperMetadata:
        """Fetch metadata for a specific paper by its source-specific ID.

        Args:
            source_id: Identifier of the paper in this backend.

        Returns:
            The normalized ``PaperMetadata`` record for that paper.
        """
        ...

Fetch metadata for a specific paper by its source-specific ID.

class ArxivSource(sources.PaperSource):
 60class ArxivSource(PaperSource):
 61    """Paper source backed by the arXiv API."""
 62
 63    @property
 64    def source_name(self) -> str:
 65        return "arxiv"
 66
 67    # TODO: should these be hardcoded?
 68    def __init__(self) -> None:
 69        self._client = arxiv.Client(num_retries=1, delay_seconds=7.0)
 70
 71    def search(
 72        self,
 73        query: str,
 74        max_results: int = 10,
 75        sort: str = "relevance",
 76    ) -> list[PaperMetadata]:
 77        if sort not in _SORT_MAP:
 78            raise ValueError(f"unknown sort {sort!r}; valid: {sorted(_SORT_MAP)}")
 79        sort_by, sort_order = _SORT_MAP[sort]
 80        search = arxiv.Search(
 81            query=query,
 82            max_results=max_results,
 83            sort_by=sort_by,
 84            sort_order=sort_order,
 85        )
 86        _check_ratelimit()
 87        try:
 88            results = list(self._client.results(search))
 89        except Exception as e:
 90            print(f"[arxiv] search error: {e}")
 91            if "429" in str(e):
 92                _record_ratelimit()
 93            raise ValueError(f"arXiv search failed: {e}") from e
 94        return [_result_to_metadata(r) for r in results]
 95
 96    def fetch_by_id(self, source_id: str) -> PaperMetadata:
 97        bare_id = source_id.removeprefix("arxiv:")
 98        if not bare_id:
 99            raise ValueError(f"source_id '{source_id}' resolves to an empty arXiv ID.")
100        search = arxiv.Search(id_list=[bare_id])
101        _check_ratelimit()
102        try:
103            result = next(self._client.results(search))
104        except StopIteration:
105            raise ArxivNotFoundError(f"Paper '{source_id}' not found on arXiv.") from None
106        except Exception as e:
107            if _ARXIV_EMPTY_PAGE_ERROR and isinstance(e, _ARXIV_EMPTY_PAGE_ERROR):
108                raise ArxivNotFoundError(f"Paper '{source_id}' not found on arXiv.") from None
109            print(f"[arxiv] fetch error: {e}")
110            if "429" in str(e):
111                _record_ratelimit()
112            raise ValueError(f"arXiv fetch failed for '{source_id}': {e}") from e
113        return _result_to_metadata(result)

Paper source backed by the arXiv API.

source_name: str
    @property
    def source_name(self) -> str:
        """Return the backend identifier for this source, ``"arxiv"``."""
        return "arxiv"

Short identifier for this backend (e.g. 'arxiv', 'openalex').

Written into PaperMetadata.source on every record this backend produces, so papers can be traced back to the source that fetched them.

def search( self, query: str, max_results: int = 10, sort: str = 'relevance') -> list[PaperMetadata]:
71    def search(
72        self,
73        query: str,
74        max_results: int = 10,
75        sort: str = "relevance",
76    ) -> list[PaperMetadata]:
77        if sort not in _SORT_MAP:
78            raise ValueError(f"unknown sort {sort!r}; valid: {sorted(_SORT_MAP)}")
79        sort_by, sort_order = _SORT_MAP[sort]
80        search = arxiv.Search(
81            query=query,
82            max_results=max_results,
83            sort_by=sort_by,
84            sort_order=sort_order,
85        )
86        _check_ratelimit()
87        try:
88            results = list(self._client.results(search))
89        except Exception as e:
90            print(f"[arxiv] search error: {e}")
91            if "429" in str(e):
92                _record_ratelimit()
93            raise ValueError(f"arXiv search failed: {e}") from e
94        return [_result_to_metadata(r) for r in results]

Search for papers matching a query string.

sort is a source-specific key. Each source defines the values it accepts and raises ValueError on unrecognized keys. Pass "relevance" (the default) to get each source's default ordering. Sources whose backend does not support sorting accept the parameter but document this explicitly and ignore it.

def fetch_by_id(self, source_id: str) -> PaperMetadata:
 96    def fetch_by_id(self, source_id: str) -> PaperMetadata:
 97        bare_id = source_id.removeprefix("arxiv:")
 98        if not bare_id:
 99            raise ValueError(f"source_id '{source_id}' resolves to an empty arXiv ID.")
100        search = arxiv.Search(id_list=[bare_id])
101        _check_ratelimit()
102        try:
103            result = next(self._client.results(search))
104        except StopIteration:
105            raise ArxivNotFoundError(f"Paper '{source_id}' not found on arXiv.") from None
106        except Exception as e:
107            if _ARXIV_EMPTY_PAGE_ERROR and isinstance(e, _ARXIV_EMPTY_PAGE_ERROR):
108                raise ArxivNotFoundError(f"Paper '{source_id}' not found on arXiv.") from None
109            print(f"[arxiv] fetch error: {e}")
110            if "429" in str(e):
111                _record_ratelimit()
112            raise ValueError(f"arXiv fetch failed for '{source_id}': {e}") from e
113        return _result_to_metadata(result)

Fetch metadata for a specific paper by its source-specific ID.

class OpenAlexSource(sources.PaperSource):
class OpenAlexSource(PaperSource):
    """Paper source backed by the OpenAlex REST API.

    Each instance owns an ``httpx.Client``. Call :meth:`close` when the source
    is no longer needed, or use the instance as a context manager, so the
    client's connections are released.
    """

    @property
    def source_name(self) -> str:
        """Return the backend identifier for this source, ``"openalex"``."""
        return "openalex"

    def __init__(self) -> None:
        self._http = httpx.Client(
            base_url=_BASE_URL,
            timeout=30.0,
        )

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._http.close()

    def __enter__(self) -> "OpenAlexSource":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def search(
        self,
        query: str,
        max_results: int = 10,
        sort: str = "relevance",
    ) -> list[PaperMetadata]:
        """Search OpenAlex works and convert each result to ``PaperMetadata``.

        Returns an empty list without calling the API when the sanitized query
        is empty. Records that fail conversion are skipped with a printed note.

        Raises:
            ValueError: if ``sort`` is not a key of ``_SORT_PARAM``, or if the
                request fails for a reason other than an HTTP error status.
            OpenAlexHTTPError: if OpenAlex answers with an HTTP error status.
        """
        if sort not in _SORT_PARAM:
            raise ValueError(f"unknown sort {sort!r}; valid: {sorted(_SORT_PARAM)}")
        sanitized = _sanitize_search_query(query)
        # An empty search returns OpenAlex's unfiltered work list; skip the call.
        if not sanitized:
            return []
        params: dict[str, str | int] = {
            "search": sanitized,
            "per_page": max_results,
            "select": _OPENALEX_WORK_FIELDS,
            "sort": _SORT_PARAM[sort],
        }
        try:
            response = self._http.get(
                "/works", params=params, headers={"User-Agent": _user_agent()}
            )
            response.raise_for_status()
            raw_results = response.json().get("results", [])
        except httpx.HTTPStatusError as e:
            status = e.response.status_code
            raise OpenAlexHTTPError(
                f"OpenAlex search failed: HTTP {status}", status
            ) from e
        except Exception as e:
            raise ValueError(f"OpenAlex search failed: {e}") from e
        results = []
        for work in raw_results:
            # One malformed record must not discard the rest of the page.
            try:
                results.append(_work_to_metadata(work))
            except Exception as e:
                print(f"[openalex] skipping malformed work record: {e}")
        return results

    def fetch_by_id(self, source_id: str) -> PaperMetadata:
        """Fetch one work by OpenAlex ID.

        Accepts a bare work ID, an ``"openalex:"``-prefixed ID, or an
        ``http(s)://`` URL whose last path segment is the work ID.

        Raises:
            OpenAlexInputError: if the ID is empty or does not match
                ``_WORK_ID_RE``.
            OpenAlexNotFoundError: if OpenAlex answers HTTP 404.
            OpenAlexHTTPError: for any other HTTP error status.
            ValueError: for any other failure (request or record conversion).
        """
        bare_id = source_id.removeprefix("openalex:")
        # Normalise any URL form (API or landing page) to a bare work ID.
        if bare_id.startswith(("http://", "https://")):
            bare_id = bare_id.rsplit("/", 1)[-1]
        if not bare_id:
            raise OpenAlexInputError(
                f"source_id '{source_id}' resolves to an empty work ID."
            )
        if not _WORK_ID_RE.fullmatch(bare_id):
            raise OpenAlexInputError(
                f"Invalid OpenAlex work ID '{bare_id}': expected 'W' followed by digits."
            )
        try:
            response = self._http.get(
                f"/works/{bare_id}",
                params={"select": _OPENALEX_WORK_FIELDS},
                headers={"User-Agent": _user_agent()},
            )
            response.raise_for_status()
            return _work_to_metadata(response.json())
        except httpx.HTTPStatusError as e:
            status = e.response.status_code
            if status == 404:
                raise OpenAlexNotFoundError(
                    f"Paper '{source_id}' not found on OpenAlex."
                ) from e
            raise OpenAlexHTTPError(
                f"OpenAlex returned HTTP {status} for '{source_id}'.", status
            ) from e
        except Exception as e:
            raise ValueError(f"OpenAlex fetch failed for '{source_id}': {e}") from e

Paper source backed by the OpenAlex REST API.

source_name: str
    @property
    def source_name(self) -> str:
        """Return the backend identifier for this source, ``"openalex"``."""
        return "openalex"

Short identifier for this backend (e.g. 'arxiv', 'openalex').

Written into PaperMetadata.source on every record this backend produces, so papers can be traced back to the source that fetched them.

def search( self, query: str, max_results: int = 10, sort: str = 'relevance') -> list[PaperMetadata]:
137    def search(
138        self,
139        query: str,
140        max_results: int = 10,
141        sort: str = "relevance",
142    ) -> list[PaperMetadata]:
143        if sort not in _SORT_PARAM:
144            raise ValueError(f"unknown sort {sort!r}; valid: {sorted(_SORT_PARAM)}")
145        sanitized = _sanitize_search_query(query)
146        # An empty search returns OpenAlex's unfiltered work list; skip the call.
147        if not sanitized:
148            return []
149        params: dict[str, str | int] = {
150            "search": sanitized,
151            "per_page": max_results,
152            "select": _OPENALEX_WORK_FIELDS,
153            "sort": _SORT_PARAM[sort],
154        }
155        try:
156            response = self._http.get(
157                "/works", params=params, headers={"User-Agent": _user_agent()}
158            )
159            response.raise_for_status()
160            raw_results = response.json().get("results", [])
161        except httpx.HTTPStatusError as e:
162            status = e.response.status_code
163            raise OpenAlexHTTPError(
164                f"OpenAlex search failed: HTTP {status}", status
165            ) from e
166        except Exception as e:
167            raise ValueError(f"OpenAlex search failed: {e}") from e
168        results = []
169        for work in raw_results:
170            try:
171                results.append(_work_to_metadata(work))
172            except Exception as e:
173                print(f"[openalex] skipping malformed work record: {e}")
174        return results

Search for papers matching a query string.

sort is a source-specific key. Each source defines the values it accepts and raises ValueError on unrecognized keys. Pass "relevance" (the default) to get each source's default ordering. Sources whose backend does not support sorting accept the parameter but document this explicitly and ignore it.

def fetch_by_id(self, source_id: str) -> PaperMetadata:
176    def fetch_by_id(self, source_id: str) -> PaperMetadata:
177        bare_id = source_id.removeprefix("openalex:")
178        # Normalise any URL form (API or landing page) to a bare work ID.
179        if bare_id.startswith(("http://", "https://")):
180            bare_id = bare_id.rsplit("/", 1)[-1]
181        if not bare_id:
182            raise OpenAlexInputError(
183                f"source_id '{source_id}' resolves to an empty work ID."
184            )
185        if not _WORK_ID_RE.fullmatch(bare_id):
186            raise OpenAlexInputError(
187                f"Invalid OpenAlex work ID '{bare_id}': expected 'W' followed by digits."
188            )
189        try:
190            response = self._http.get(
191                f"/works/{bare_id}",
192                params={"select": _OPENALEX_WORK_FIELDS},
193                headers={"User-Agent": _user_agent()},
194            )
195            response.raise_for_status()
196            return _work_to_metadata(response.json())
197        except httpx.HTTPStatusError as e:
198            status = e.response.status_code
199            if status == 404:
200                raise OpenAlexNotFoundError(
201                    f"Paper '{source_id}' not found on OpenAlex."
202                ) from e
203            raise OpenAlexHTTPError(
204                f"OpenAlex returned HTTP {status} for '{source_id}'.", status
205            ) from e
206        except Exception as e:
207            raise ValueError(f"OpenAlex fetch failed for '{source_id}': {e}") from e

Fetch metadata for a specific paper by its source-specific ID.

class CrossRefSource(sources.PaperSource):
class CrossRefSource(PaperSource):
    """PaperSource adapter over the module-level CrossRef helper functions."""

    @property
    def source_name(self) -> str:
        """Return the backend identifier for this source, ``"crossref"``."""
        return "crossref"

    def search(
        self,
        query: str,
        max_results: int = 10,
        sort: str = "relevance",
    ) -> list[PaperMetadata]:
        """Search CrossRef by title via ``search_by_title``.

        ``sort`` exists only to satisfy the PaperSource protocol and is ignored;
        CrossRef title search returns results in relevance order regardless.
        """
        _ = sort  # deliberately unused
        return search_by_title(query, limit=max_results)

    def fetch_by_id(self, source_id: str) -> PaperMetadata:
        """Fetch the record for a DOI; a leading ``"doi:"`` prefix is stripped.

        Raises:
            ValueError: if ``fetch_by_doi`` returns ``None`` for the DOI.
        """
        doi = source_id.removeprefix("doi:")
        record = fetch_by_doi(doi)
        if record is not None:
            return record
        raise ValueError(f"CrossRef: no record found for DOI '{source_id}'")

Paper source backed by the CrossRef REST API.

source_name: str
    @property
    def source_name(self) -> str:
        """Return the backend identifier for this source, ``"crossref"``."""
        return "crossref"

Short identifier for this backend (e.g. 'arxiv', 'openalex').

Written into PaperMetadata.source on every record this backend produces, so papers can be traced back to the source that fetched them.

def search( self, query: str, max_results: int = 10, sort: str = 'relevance') -> list[PaperMetadata]:
115    def search(
116        self,
117        query: str,
118        max_results: int = 10,
119        sort: str = "relevance",
120    ) -> list[PaperMetadata]:
121        # ``sort`` is accepted to satisfy the PaperSource protocol; CrossRef title search
122        # returns results in relevance order regardless.
123        del sort
124        return search_by_title(query, limit=max_results)

Search for papers matching a query string.

sort is a source-specific key. Each source defines the values it accepts and raises ValueError on unrecognized keys. Pass "relevance" (the default) to get each source's default ordering. Sources whose backend does not support sorting accept the parameter but document this explicitly and ignore it.

def fetch_by_id(self, source_id: str) -> PaperMetadata:
126    def fetch_by_id(self, source_id: str) -> PaperMetadata:
127        meta = fetch_by_doi(source_id.removeprefix("doi:"))
128        if meta is None:
129            raise ValueError(f"CrossRef: no record found for DOI '{source_id}'")
130        return meta

Fetch metadata for a specific paper by its source-specific ID.

def fetch_by_doi(doi: str) -> PaperMetadata | None:
def fetch_by_doi(doi: str) -> PaperMetadata | None:
    """Fetch CrossRef metadata for a DOI. Returns None on any error.

    Args:
        doi: Bare DOI (no ``doi:`` prefix, no resolver URL).

    Returns:
        The parsed record, or ``None`` when the DOI is empty, the response
        status is not 200, the record has no title, or any exception occurs.
    """
    from urllib.parse import quote

    if not doi:
        # Nothing to look up; avoid a request against the collection endpoint.
        return None
    try:
        # DOIs may contain characters such as '?' or '#' that would otherwise
        # be parsed as a query string or fragment; percent-encode the DOI but
        # keep '/' so the prefix/suffix separator stays a path separator.
        doi_path = quote(doi, safe="/")
        with httpx.Client(headers={"User-Agent": _mailto_header()}, timeout=10.0) as client:
            resp = client.get(f"{CROSSREF_BASE}/{doi_path}")
        if resp.status_code != 200:
            return None
        msg = resp.json().get("message", {})
        if not msg.get("title"):
            return None
        # The record keeps the caller's DOI, not the percent-encoded form.
        return _parse_crossref_work(msg, doi=doi)
    except Exception:
        # Documented best-effort contract: every failure maps to None.
        return None

Fetch CrossRef metadata for a DOI. Returns None on any error.

def search_by_title(title: str, limit: int = 5) -> list[PaperMetadata]:
 87def search_by_title(title: str, limit: int = 5) -> list[PaperMetadata]:
 88    """Search CrossRef by title. Returns empty list on any error."""
 89    try:
 90        with httpx.Client(headers={"User-Agent": _mailto_header()}, timeout=10.0) as client:
 91            resp = client.get(
 92                CROSSREF_BASE,
 93                params={"query.title": title, "rows": limit},
 94            )
 95        if resp.status_code != 200:
 96            return []
 97        items = resp.json().get("message", {}).get("items", [])
 98        results = []
 99        for item in items:
100            doi = item.get("DOI", "")
101            if item.get("title") and doi:
102                results.append(_parse_crossref_work(item, doi=doi))
103        return results
104    except Exception:
105        return []

Search CrossRef by title. Returns empty list on any error.

def fetch_paper_metadata(source_id: str, print_on: bool = False) -> arxiv.Result:
def fetch_paper_metadata(source_id: str, print_on: bool = False) -> arxiv.Result:
    """Fetch a single arXiv result by ID, optionally printing a summary.

    ``source_id`` is handed to ``arxiv.Search(id_list=...)`` unchanged; no
    ``"arxiv:"`` prefix is stripped here.

    Args:
        source_id: arXiv identifier to look up.
        print_on: When true, print title, date, authors, category, DOI,
            PDF URL and abstract to stdout.

    Returns:
        The first ``arxiv.Result`` produced for the ID.
    """
    request = arxiv.Search(id_list=[source_id])
    paper = _arxiv_call(lambda: next(_client.results(request)))

    if not print_on:
        return paper

    print(f"Title:    {paper.title}")
    print(f"Date:     {paper.published.strftime('%Y-%m-%d')}")
    print(f"Authors:  {', '.join(author.name for author in paper.authors)}")
    print(f"Category: {paper.primary_category}")
    print(f"DOI:      {paper.doi}")
    print(f"PDF URL:  {paper.pdf_url}")
    print("-" * 30)
    print(f"Abstract:\n{paper.summary}")
    return paper
def search_papers( query: str, max_results: int = 10, sort_by: arxiv.SortCriterion = <SortCriterion.Relevance: 'relevance'>, sort_order: arxiv.SortOrder = <SortOrder.Descending: 'descending'>, print_on: bool = False) -> list[arxiv.Result]:
def search_papers(
    query: str,
    max_results: int = 10,
    sort_by: arxiv.SortCriterion = arxiv.SortCriterion.Relevance,
    sort_order: arxiv.SortOrder = arxiv.SortOrder.Descending,
    print_on: bool = False,
) -> list[arxiv.Result]:
    """Search arXiv and return the raw ``arxiv.Result`` objects.

    Args:
        query: arXiv query string.
        max_results: Maximum number of results requested.
        sort_by: arXiv sort criterion.
        sort_order: arXiv sort direction.
        print_on: When true, print title, date, authors and category of each
            hit to stdout, followed by a divider line.

    Returns:
        All results of the search as a list.
    """
    request = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=sort_by,
        sort_order=sort_order,
    )
    found = _arxiv_call(lambda: list(_client.results(request)))

    if not print_on:
        return found

    for entry in found:
        print(f"Title:    {entry.title}")
        print(f"Date:     {entry.published.strftime('%Y-%m-%d')}")
        print(f"Authors:  {', '.join(author.name for author in entry.authors)}")
        print(f"Category: {entry.primary_category}")
        print("-" * 30)
    return found
def gen_md_file( paper: arxiv.Result, additional_tags: None | Sequence[str] = None, print_on: bool = False):
 97def gen_md_file(paper: arxiv.Result, additional_tags: None | Sequence[str] = None, print_on: bool = False):
 98    title: str = paper.title
 99    bare_id: str = paper.entry_id.split('/')[-1]
100    source_id: str = f"arxiv:{bare_id}"
101    url: str = f"https://arxiv.org/abs/{bare_id}"
102    authors: list[str] = [author.name for author in paper.authors]
103    tags: list[str] = ["clippings", "research", "clipping"]
104
105    if additional_tags:
106        for s in additional_tags:
107            tags.append(s)
108
109    date = paper.published.strftime('%Y-%m-%d')
110    vault = _vault_dir()
111    vault.mkdir(parents=True, exist_ok=True)
112    filename = vault / f"{bare_id}.md"
113
114    author_list = "\n".join([f'  - "[[{name}]]"' for name in authors])
115    tag_list = "\n".join([f'- {tag}' for tag in tags])
116    with open(resources_dir() / "formats" / "table_format.md", "r", encoding="utf-8") as f:
117        template = f.read()
118        final_content = template.format(
119            title=title,
120            url=url,
121            author_list=author_list,
122            date=date,
123            tag_list=tag_list
124        )
125    if print_on:
126        print(final_content)
127    with open(filename, "w", encoding="utf-8") as f:
128        f.write(final_content)
def gen_md_files( papers: list[arxiv.Result], additional_tags: None | Sequence[str] = None) -> None:
def gen_md_files(papers: list[arxiv.Result], additional_tags: None | Sequence[str] = None) -> None:
    """Write one Markdown note per paper by calling ``gen_md_file`` on each.

    Args:
        papers: arXiv results to render, processed in order.
        additional_tags: Extra tags forwarded unchanged to every note.
    """
    for entry in papers:
        gen_md_file(entry, additional_tags=additional_tags)
def resolve_doi(doi: str) -> PaperMetadata:
def resolve_doi(doi: str) -> PaperMetadata:
    """
    Resolve a DOI to PaperMetadata by trying three strategies in order:
      1. arXiv-issued DOI  → fetch directly from arXiv
      2. Semantic Scholar  → resolves any DOI; uses arXiv ID when available
      3. CrossRef          → last resort; broadest DOI coverage

    The first strategy that returns a truthy result wins.

    Raises ValueError with a human-readable message when the DOI is empty
    after normalisation or when every strategy comes back empty.
    """
    doi = _strip_doi_url(doi)
    if not doi:
        raise ValueError("Please enter a DOI.")

    for strategy in (_try_arxiv_doi, _try_semantic_scholar, _try_crossref):
        resolved = strategy(doi)
        if resolved:
            return resolved

    raise ValueError(
        "Could not resolve this DOI.\n"
        "• Check the DOI is correct\n"
        "• The paper may not be indexed by Semantic Scholar or CrossRef\n"
        "• arXiv-hosted papers use DOIs starting with 10.48550/arXiv."
    )

Resolve a DOI to PaperMetadata via three strategies:

  1. arXiv-issued DOI → fetch directly from arXiv
  2. Semantic Scholar → resolves any DOI; uses arXiv ID when available
  3. CrossRef → last resort; broadest DOI coverage

Raises ValueError with a human-readable message on failure.
def _resolve_doi(doi: str) -> PaperMetadata:
def _resolve_doi(doi: str) -> PaperMetadata:
    """Backward-compatible name used by the PyQt DOI page.

    Delegates directly to ``resolve_doi`` and returns (or raises) whatever it
    does.
    """
    return resolve_doi(doi)

Backward-compatible name used by the PyQt DOI page.

def extract_pdf_metadata(pdf_path: str) -> dict:
def extract_pdf_metadata(pdf_path: str) -> dict:
    """Extract raw metadata from a PDF's embedded fields and first-page text.

    Returns a dict with keys: title, authors, doi, arxiv_id, year, abstract.
    Values are None when not found. `abstract` is always None at this stage —
    the PDF /Subject field is not a reliable abstract source.

    ``title`` and ``authors`` come from the embedded /Title and /Author fields
    unless the helper checks reject them; ``doi`` and ``arxiv_id`` are searched
    for in the first page's text, which is also the fallback source for the
    title. ``year`` is the first four-digit run in /CreationDate (or /ModDate),
    accepted only if it lies between 1900 and next year.
    """
    reader = PdfReader(pdf_path)
    info = reader.metadata or {}

    title = info.get("/Title") or None
    if title and _looks_autogenerated(title):
        title = None

    authors = info.get("/Author") or None
    if authors and _looks_junk_author(authors):
        authors = None

    doi = None
    arxiv_id = None
    first_page_text = ""
    if reader.pages:
        first_page_text = reader.pages[0].extract_text() or ""
        doi = _extract_doi(first_page_text)
        arxiv_id = _extract_arxiv_id(first_page_text)

    # Fall back to a text heuristic only when the embedded title was unusable.
    if first_page_text and not title:
        title = _extract_title_from_text(first_page_text)

    year = None
    stamp = info.get("/CreationDate") or info.get("/ModDate")
    if stamp:
        found = re.search(r"(\d{4})", str(stamp))
        if found:
            value = int(found.group(1))
            latest_plausible = datetime.date.today().year + 1
            if 1900 <= value <= latest_plausible:
                year = value

    return {
        "title": title,
        "authors": authors,
        "doi": doi,
        "arxiv_id": arxiv_id,
        "year": year,
        "abstract": None,
    }

Extract raw metadata from a PDF's embedded fields and first-page text.

Returns a dict with keys: title, authors, doi, arxiv_id, year, abstract. Values are None when not found. abstract is always None at this stage — the PDF /Subject field is not a reliable abstract source.

def resolve_pdf_metadata( pdf_path: str) -> tuple[PaperMetadata, tuple[str, int] | None]:
def resolve_pdf_metadata(
    pdf_path: str,
) -> tuple[PaperMetadata, tuple[str, int] | None]:
    """Run the full metadata pipeline for a PDF file.

    Returns ``(meta, external_identity)`` where:
      - ``meta`` is a PaperMetadata whose source_id is the local identifier
        computed for the file, so the imported paper defaults to a local
        identity regardless of where its metadata was sourced. Its ``source``
        is always ``"pdf"`` and its ``version`` is always 1.
      - ``external_identity`` is ``(source_id, version)`` of the upstream
        record we matched (e.g. ``("arxiv:2204.12985", 4)`` or
        ``("doi:10.1234/xyz", 1)``) when enrichment succeeded, else ``None``.
        Callers that have DB access can use this to dedupe against existing
        paper roots — see ``service/paper.py:import_pdf``.

    Resolution order for metadata enrichment:
      1. arXiv ID found in first-page text  -> ArxivSource.fetch_by_id()
      2. DOI found in first-page text       -> resolve_doi()
      3. Title (embedded or text heuristic) -> CrossRef title search
      4. Partial record from whatever was extracted (caller prompts for missing fields)

    Raises FileNotFoundError if the path does not exist, or pypdf errors on corrupt PDFs.
    """
    raw = extract_pdf_metadata(pdf_path)
    local_id = _pdf_source_id(pdf_path)

    upstream: PaperMetadata | None = None

    if raw["arxiv_id"]:
        try:
            upstream = ArxivSource().fetch_by_id(raw["arxiv_id"])
        except Exception:
            # Best-effort: any arXiv failure falls through to the next strategy.
            pass

    if not upstream and raw["doi"]:
        try:
            upstream = resolve_doi(raw["doi"])
        except ValueError:
            # Unresolvable DOI; fall through to the title search.
            pass

    if not upstream and raw["title"]:
        upstream = _try_crossref_title(raw["title"])

    if not upstream:
        # Nothing matched upstream: build a partial record from the PDF alone.
        # The embedded author string is split on semicolons only, so a name
        # written as "Doe, John" stays in one piece.
        names: list[str] = []
        if raw["authors"]:
            names = [piece.strip() for piece in raw["authors"].split(";") if piece.strip()]

        if raw["year"]:
            published = datetime.date(raw["year"], 1, 1)
        else:
            published = datetime.date.today()

        partial = PaperMetadata(
            source_id=local_id,
            version=1,
            title=raw["title"] or "",
            authors=names,
            published=published,
            summary="",
            source="pdf",
        )
        return partial, None

    external = None
    if upstream.source_id:
        external = (upstream.source_id, upstream.version)

    enriched_meta = PaperMetadata(
        source_id=local_id,
        version=1,
        title=upstream.title or raw["title"] or "",
        authors=upstream.authors,
        published=upstream.published,
        summary=upstream.summary,
        doi=upstream.doi,
        url=upstream.url,
        category=upstream.category,
        source="pdf",
    )
    return enriched_meta, external

Run the full metadata pipeline for a PDF file.

Returns (meta, external_identity) where:

  • meta is a PaperMetadata with a local:<sha256> source_id so the imported paper defaults to a local identity regardless of where it was sourced. Metadata fields (title, authors, etc.) are enriched from an upstream record when an arXiv ID or DOI is found in the first-page text, or when a CrossRef title search finds a match.
  • external_identity is (source_id, version) of the upstream record we matched (e.g. ("arxiv:2204.12985", 4) or ("doi:10.1234/xyz", 1)) when enrichment succeeded, else None. Callers that have DB access can use this to dedupe against existing paper roots — see service/paper.py:import_pdf.

Resolution order for metadata enrichment:

  1. arXiv ID found in first-page text -> ArxivSource.fetch_by_id()
  2. DOI found in first-page text -> resolve_doi()
  3. Title (embedded or text heuristic) -> CrossRef title search
  4. Partial record from whatever was extracted (caller prompts for missing fields)

Raises FileNotFoundError if the path does not exist, or pypdf errors on corrupt PDFs.