Skip to content

utils.bib

Utils - bib¤

get_dois_from_bib(bib_file) ¤

[Deprecated] use get_unique_ids_from_bib instead.

get_dois_from_bib returns a list of DOIs from a bib file

Parameters:

Name Type Description Default
bib_file Union[str, Path]

path to bib file

required

Returns:

Type Description
list

list of DOIs

Source code in kirsche/utils/bib.py
111
112
113
114
115
116
117
118
119
120
121
def get_dois_from_bib(bib_file: Union[str, Path]) -> list:
    """
    [Deprecated] use get_unique_ids_from_bib instead.

    get_dois_from_bib returns a list of DOIs from a bib file.

    This is a thin wrapper that delegates to `get_unique_ids_from_bib`,
    restricting the lookup to the `doi` field.

    :param bib_file: path to bib file
    :return: list of DOIs, as produced by `get_unique_ids_from_bib`
    """

    # The delegate's parameter is named `keys`; the former `key=` keyword
    # raised TypeError on every call.
    return get_unique_ids_from_bib(bib_file, keys="doi")

get_dois_from_bib_re(bib_file) ¤

Retrieve DOIs by parsing bib file line by line.

Source code in kirsche/utils/bib.py
28
29
30
31
32
33
34
35
36
37
38
39
def get_dois_from_bib_re(bib_file: Union[str, Path]) -> list:
    """Retrieve DOIs by parsing bib file line by line.

    A line contributes a DOI when, after surrounding whitespace is stripped,
    it starts with a ``doi`` field whose value is wrapped in braces, e.g.
    ``doi = {10.1000/xyz},``. The field name is matched case-insensitively
    and whitespace around ``=`` is optional, so ``DOI={...}`` and
    column-aligned ``doi    = {...}`` are recognised as well.

    :param bib_file: path to bib file
    :return: list of DOI strings in the order they appear in the file
    """

    # `.+` is greedy on purpose: it runs to the last closing brace on the
    # line, so a trailing comma after the field is tolerated.
    re_doi = re.compile(r"^doi\s*=\s*\{(?P<doi>.+)\}", re.IGNORECASE)

    with open(bib_file, "r") as bibtex_file:
        # Stream the file instead of materialising every line up front.
        matches = (re_doi.search(line.strip()) for line in bibtex_file)
        return [m.group("doi") for m in matches if m is not None]

get_unique_ids_from_bib(bib_file, keys=None, unique_id_prefix=None) ¤

get_unique_ids_from_bib returns a list of unique IDs from a bib file for a given key or list of keys.

By default, the key is "doi". It can also be arxivid, which can return values like arXiv:0804.4726, or pmid, which can return values like PMID:26017442.

keys can also be a list of keys to check in order of priority, e.g., ["doi", "arxivid", "pmid"].

For each record, we will look up the paper based on the order of the list. If doi exists, the function will use doi and move on to the next record. If doi is not found, the function will fall back to arxivid for the same record.

If only one value in keys is specified, we will use that same key for all records.

Parameters:

Name Type Description Default
bib_file Union[str, Path]

path to bib file

required
keys Optional[Union[str, list]]

key to use to find unique ids in the bib data, default is doi.

None
unique_id_prefix dict

prefix to use for unique ids

None

Returns:

Type Description
list

list of unique IDs

Source code in kirsche/utils/bib.py
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def get_unique_ids_from_bib(
    bib_file: Union[str, Path],
    keys: Optional[Union[str, list]] = None,
    unique_id_prefix: Optional[dict] = None,
) -> list:
    """
    get_unique_ids_from_bib returns a list of unique IDs from a bib file for a given key or list of keys.

    A key is the name of a bib field, for example
    - doi
    - arxivid, which can return values like arXiv:0804.4726
    - pmid, which can return values like PMID:26017442.

    keys can also be a list of keys to check in order of priority, e.g., `["doi", "arxivid", "pmid"]`.

    For each record, we will look up the paper based on the order of the list. If doi exists, the function will use doi and move on to the next record. If doi is not found, the function will fall back to arxivid for the same record.

    If only one value in keys is specified, we will use that same key for all records.

    :param bib_file: path to bib file
    :param keys: key, or list of keys in priority order, to use to find unique
        ids in the bib data. When None, the module-level
        `UNIQUE_ID_PRECEDENCE` is used.
    :param unique_id_prefix: mapping from key to the prefix to use for unique
        ids. When None, the module-level `UNIQUE_ID_PREFIX` is used.
    :return: list of unique IDs, one entry per bib record
    :raises ValueError: if the resolved keys are empty
    """
    if keys is None:
        keys = UNIQUE_ID_PRECEDENCE
    elif isinstance(keys, str):
        keys = [keys]

    # Fail fast: reject an empty key list before reading and parsing the file.
    if not keys:
        raise ValueError("key or key_precedence must be specified")

    if unique_id_prefix is None:
        unique_id_prefix = UNIQUE_ID_PREFIX

    bib_data = load_bib(bib_file)

    return parse_unique_ids_by_keys(bib_data, keys, unique_id_prefix)

load_bib(bib_file) ¤

Load bib content from bib files

Source code in kirsche/utils/bib.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
def load_bib(bib_file: Union[str, Path]) -> list:
    """Read a bib file and return the entries parsed from it.

    :param bib_file: path to bib file; a string is converted to a `Path`
    :return: the `entries` of the database parsed by `bibtexparser`
    :raises FileNotFoundError: if `bib_file` does not exist
    """

    bib_path = Path(bib_file) if isinstance(bib_file, str) else bib_file

    if not bib_path.exists():
        raise FileNotFoundError(f"{bib_path} does not exist")

    with open(bib_path, "r") as handle:
        # Parser configured with common strings enabled and the
        # convert_to_unicode customization applied to the records.
        unicode_parser = BibTexParser(common_strings=True)
        unicode_parser.customization = convert_to_unicode
        return bibtexparser.load(handle, parser=unicode_parser).entries

parse_unique_ids_by_keys(bib_data, keys, unique_id_prefix) ¤

parse_unique_ids_by_keys parses bib data based on keys.

Parameters:

Name Type Description Default
bib_data list

list of bib records loaded from a bib file

required
keys Union[str, list]

list of keys as the lookup order, e.g., ["doi", "arxivid", "pmid"]

required
unique_id_prefix dict

a dictionary to specify what prefix to use for each key

required
Source code in kirsche/utils/bib.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
def parse_unique_ids_by_keys(
    bib_data: list, keys: Union[str, list], unique_id_prefix: dict
) -> list:
    """
    parse_unique_ids_by_keys parses bib data based on keys.

    For every record the keys are tried in order; the first key with a
    non-empty value yields that record's unique id, formed as the key's prefix
    followed by the value. A record with no usable value for any key
    contributes an empty string, so the result has one entry per record.

    :param bib_data: list of bib records loaded from a bib file
    :param keys: list of keys as the lookup order, e.g., ["doi", "arxivid", "pmid"];
        a single key may be given as a plain string
    :param unique_id_prefix: a dictionary to specify what prefix to use for each key;
        keys missing from it get no prefix
    :return: list of unique ids, one per record in `bib_data`
    """

    # A bare string would otherwise be iterated character by character.
    if isinstance(keys, str):
        keys = [keys]

    ids = []
    for record in bib_data:
        unique_id = ""
        for key in keys:
            value = record.get(key, "")
            # We do not need the version in the arxivids. Only a trailing
            # "v<digits>" is removed, so ids whose archive name contains a
            # "v" (e.g. "solv-int/9901001v1") are not truncated.
            if key == "arxivid":
                value = re.sub(r"v\d+$", "", value)
            if value:
                prefix = unique_id_prefix.get(key, "")
                unique_id = f"{prefix}{value}"
                break
        ids.append(unique_id)

    return ids