formats

File-format registry for paper import/export.

Usage

from formats import registry, format_for_extension

fmt = format_for_extension(".bib")   # -> BibTeXFormat instance
papers = fmt.import_file("refs.bib") # -> list[PaperMetadata]

Adding a new format

  1. Create formats/<name>.py implementing PaperFileFormat (see formats/base.py).
  2. Import and register it here.
 1"""File-format registry for paper import/export.
 2
 3Usage
 4-----
 5from formats import registry, format_for_extension
 6
 7fmt = format_for_extension(".bib")   # -> BibTeXFormat instance
 8papers = fmt.import_file("refs.bib") # -> list[PaperMetadata]
 9
10Adding a new format
11-------------------
121. Create formats/<name>.py implementing PaperFileFormat (see formats/base.py).
132. Import and register it here.
14"""
15
16from __future__ import annotations
17
18from formats.base import PaperFileFormat
19from formats.json_fmt import JSONFormat
20from formats.csv_fmt import CSVFormat, TSVFormat
21from formats.bibtex import BibTeXFormat
22from formats.markdown import MarkdownFormat, ObsidianFormat
23
24registry: dict[str, PaperFileFormat] = {
25    "json":     JSONFormat(),
26    "csv":      CSVFormat(),
27    "tsv":      TSVFormat(),
28    "bibtex":   BibTeXFormat(),
29    "markdown": MarkdownFormat(),
30    "obsidian": ObsidianFormat(),
31}
32
33
34def format_for_extension(ext: str) -> PaperFileFormat | None:
35    """Return the format handler for a file extension (e.g. '.bib'), or None."""
36    ext = ext.lower()
37    for fmt in registry.values():
38        if ext in fmt.extensions:
39            return fmt
40    return None
41
42
43__all__ = ["PaperFileFormat", "registry", "format_for_extension"]
@runtime_checkable
class PaperFileFormat(Protocol):
    """Structural interface that every paper import/export format implements.

    Because the protocol is ``@runtime_checkable``, ``isinstance(obj,
    PaperFileFormat)`` works at runtime, but it only checks that the
    attributes and methods below are present; their type signatures are not
    verified.
    """

    format_name: str       # e.g. "json", "csv", "bibtex"
    extensions: list[str]  # e.g. [".json"], [".bib"]

    def import_file(self, path: str) -> list[PaperMetadata]:
        """Parse a file and return PaperMetadata records."""
        ...

    def export_papers(self, papers: list[dict]) -> str:
        """Serialize paper dicts to a string in this format."""
        ...

Base class for protocol classes.

Protocol classes are defined as::

class Proto(Protocol):
    def meth(self) -> int:
        ...

Such classes are primarily used with static type checkers that recognize structural subtyping (static duck-typing).

For example::

class C:
    def meth(self) -> int:
        return 0

def func(x: Proto) -> int:
    return x.meth()

func(C())  # Passes static type check

See PEP 544 for details. Protocol classes decorated with @typing.runtime_checkable act as simple-minded runtime protocols that check only the presence of given attributes, ignoring their type signatures. Protocol classes can be generic, they are defined as::

class GenProto[T](Protocol):
    def meth(self) -> T:
        ...
PaperFileFormat(*args, **kwargs)
1866def _no_init_or_replace_init(self, *args, **kwargs):
1867    cls = type(self)
1868
1869    if cls._is_protocol:
1870        raise TypeError('Protocols cannot be instantiated')
1871
1872    # Already using a custom `__init__`. No need to calculate correct
1873    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
1874    if cls.__init__ is not _no_init_or_replace_init:
1875        return
1876
1877    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
1878    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
1879    # searches for a proper new `__init__` in the MRO. The new `__init__`
1880    # replaces the subclass' old `__init__` (ie `_no_init_or_replace_init`). Subsequent
1881    # instantiation of the protocol subclass will thus use the new
1882    # `__init__` and no longer call `_no_init_or_replace_init`.
1883    for base in cls.__mro__:
1884        init = base.__dict__.get('__init__', _no_init_or_replace_init)
1885        if init is not _no_init_or_replace_init:
1886            cls.__init__ = init
1887            break
1888    else:
1889        # should not happen
1890        cls.__init__ = object.__init__
1891
1892    cls.__init__(self, *args, **kwargs)
format_name: str
extensions: list[str]
def import_file(self, path: str) -> list[PaperMetadata]:
    """Parse a file and return PaperMetadata records.

    Protocol stub: the body is intentionally ``...``.
    """
    ...

Parse a file and return PaperMetadata records.

def export_papers(self, papers: list[dict]) -> str:
    """Serialize paper dicts to a string in this format.

    Protocol stub: the body is intentionally ``...``.
    """
    ...

Serialize paper dicts to a string in this format.

registry: dict[str, PaperFileFormat] = {'json': <formats.json_fmt.JSONFormat object>, 'csv': <formats.csv_fmt.CSVFormat object>, 'tsv': <formats.csv_fmt.TSVFormat object>, 'bibtex': <formats.bibtex.BibTeXFormat object>, 'markdown': <formats.markdown.MarkdownFormat object>, 'obsidian': <formats.markdown.ObsidianFormat object>}
def format_for_extension(ext: str) -> PaperFileFormat | None:
    """Return the format handler for a file extension (e.g. '.bib'), or None.

    The extension is lower-cased and then matched by exact membership in each
    registered handler's ``extensions`` list. Handlers are tried in
    ``registry`` insertion order and the first match is returned.
    """
    ext = ext.lower()
    for fmt in registry.values():
        if ext in fmt.extensions:
            return fmt
    return None

Return the format handler for a file extension (e.g. '.bib'), or None.