Skip to content

utils.io

Utils - io¤

is_dir(path) ¤

Check if the given path is a directory

Parameters:

Name Type Description Default
path Union[str, Path]

path to be checked

required
Source code in kirsche/utils/io.py
27
28
29
30
31
32
33
34
35
36
37
38
def is_dir(path: Union[str, Path]) -> bool:
    """Check if the given path is a directory.

    If the path exists, the filesystem is consulted.  If it does not
    exist, a trailing ``/`` is taken as the convention for "directory".

    :param path: path to be checked
    :return: True if the path is (or denotes) a directory
    """
    # Keep the raw string before converting: Path("foo/") stringifies
    # back as "foo", so checking str(Path(...)).endswith("/") after the
    # conversion could never succeed for string inputs.
    raw = str(path)
    if isinstance(path, str):
        path = Path(path)

    if path.exists():
        return path.is_dir()
    else:
        # Non-existent paths are judged by the trailing-slash convention
        return raw.endswith("/")

load_batch_json(data_path) ¤

load data from json file(s)

If the given data_path is a folder, all the json files in the folder are loaded. If the given data_path is a single json file, everything inside the file will be loaded.

Parameters:

Name Type Description Default
data_path Union[str, Path]

json file path

required
Source code in kirsche/utils/io.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def load_batch_json(data_path: Union[str, Path]) -> Union[dict, list]:
    """load data from json file(s)

    If the given `data_path` is a folder, all the json files in the folder are loaded into a list. If the given `data_path` is a single json file, everything inside the file will be loaded.

    :param data_path: json file path or folder path
    :return: list of per-file payloads (folder input), the file content
        (file input), or an empty list if the path does not exist
    """
    if isinstance(data_path, str):
        data_path = Path(data_path)

    # Missing paths yield an empty result rather than raising,
    # matching the best-effort behavior of the batch workflow.
    if not data_path.exists():
        return []

    if data_path.is_dir():
        logger.debug(f"loading all data files from {data_path} folder")
        data_path_all_json = list(data_path.glob("*.json"))
        logger.debug(f"Found {len(data_path_all_json)} json files in {data_path}.")
        data = []
        for data_file in data_path_all_json:
            logger.debug(f"loading data from {data_file}")
            data.append(load_json(data_file))
    elif data_path.is_file():
        logger.debug(f"loading data from a single file {data_path}")
        data = load_json(data_path)
    else:
        # Existing path that is neither a directory nor a regular file
        # (e.g. FIFO, socket): previously `data` was left unbound here
        # and the final return raised NameError.
        logger.warning(f"{data_path} is neither a directory nor a regular file")
        data = []

    return data

load_json(data_file) ¤

load dict/list of dict data from json file

Parameters:

Name Type Description Default
data_file Union[str, Path]

json file path

required
Source code in kirsche/utils/io.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
def load_json(data_file: Union[str, Path]) -> Union[dict, list]:
    """load dict/list of dict data from json file

    :param data_file: json file path
    :return: the parsed json content
    :raises FileNotFoundError: if the file does not exist
    """
    if isinstance(data_file, str):
        data_file = Path(data_file)
    if not data_file.exists():
        # FileNotFoundError is a subclass of Exception, so callers
        # catching the previous generic Exception keep working.
        raise FileNotFoundError(f"File not found: {data_file}")

    logger.debug(f"loading data from {data_file}")
    with open(data_file, "r") as f:
        data = json.load(f)

    # json.load already returns a freshly-built object; the previous
    # `data.copy()` was redundant and raised AttributeError on scalar
    # json payloads (numbers, strings, booleans, null).
    return data

record_exists(id, existing_records, keys=UNIQUE_ID_PRECEDENCE, unique_id_prefix=UNIQUE_ID_PREFIX) ¤

Whether the record already exists in the data file

Parameters:

Name Type Description Default
id

unique identifier of the record to look up (possibly prefixed, e.g. arXiv: or PMID:)

required
Source code in kirsche/utils/io.py
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
def record_exists(
    id,
    existing_records: list,
    keys: list = None,
    unique_id_prefix: dict = None,
) -> bool:
    """Whether a record with the given identifier already exists.

    :param id: unique identifier of the record to look up, possibly
        carrying a source prefix such as ``arXiv:`` or ``PMID:``
    :param existing_records: records already present in the data file
    :param keys: record keys to compare against, in order of
        precedence; defaults to ``UNIQUE_ID_PRECEDENCE``
    :param unique_id_prefix: mapping of source name to id prefix;
        defaults to ``UNIQUE_ID_PREFIX``
    :return: True if a matching record is found, False otherwise
    """
    # Resolve the module-level constants lazily at call time; the body
    # already treated None as "use the default", so binding the
    # constants in the signature as well was redundant.
    if keys is None:
        keys = UNIQUE_ID_PRECEDENCE
    if unique_id_prefix is None:
        unique_id_prefix = UNIQUE_ID_PREFIX

    # Ids are most likely to carry prefixes (e.g. arXiv:, PMID:);
    # strip the first matching prefix before comparing.
    cleansing_id = id
    for prefix in unique_id_prefix.values():
        if id.startswith(prefix):
            cleansing_id = id.replace(prefix, "")
            break

    # Default to "not found": a missed lookup merely triggers a
    # re-download, whereas a false positive would silently drop data.
    for record in existing_records:
        for key in keys:
            key_value = record.get(key, "")
            if key_value is None:
                continue
            if key_value.lower() == cleansing_id.lower():
                return True

    return False

save_batch_json(records, data_path, unique_key=None, mode=None) ¤

save data to json file.

There are two modes: - single file mode, if the data_path is a json file path, and - multi file mode, if the data_path is a folder.

In the single file mode, all the entries in the data are saved to the same file. In the multi file mode, each entry will be saved as a separate file.

Single file mode is good for long term preservation, and multi file mode is good for updates.

Parameters:

Name Type Description Default
records list

list of data to be saved

required
data_path Union[str, Path]

json file path or folder path

required
Source code in kirsche/utils/io.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def save_batch_json(records: list, data_path: Union[str, Path], unique_key=None, mode=None) -> None:
    """save data to json file.

    There are two modes:
    - single file mode, if the `data_path` is a json file path, and
    - multi file mode, if the `data_path` is a folder.

    In the single file mode, all the entries in the data are saved to the same file. In the multi file mode, each entry will be saved as a separate file named after its `unique_key` value.

    Single file mode is good for long term preservation, and multi file mode is good for updates.

    :param records: list of data to be saved
    :param data_path: json file path or folder path
    :param unique_key: record key used to name per-record files;
        defaults to ``"corpusId"``
    :param mode: "single" or "multi"; inferred from the path suffix
        when the target does not exist yet
    """
    if isinstance(data_path, str):
        data_path = Path(data_path)

    # Infer the mode from the suffix when the target doesn't exist yet
    if not data_path.exists():
        mode = "single" if str(data_path).endswith(".json") else "multi"

    if unique_key is None:
        unique_key = "corpusId"

    if data_path.is_dir() or (mode == "multi"):
        if not data_path.exists():
            data_path.mkdir(parents=True)
        logger.debug(f"saving all data records to {data_path} folder")
        data_path_all_json = list(data_path.glob("*.json"))
        logger.debug(f"Found {len(data_path_all_json)} json files in {data_path}.")
        for record in records:
            # Records without the unique key cannot be named; skip them
            try:
                unique_key_value = record[unique_key]
            except KeyError:
                logger.error(f"{unique_key} not found in {record}")
                continue

            data_file = data_path / f"{unique_key_value}.json"
            logger.debug(f"saving data to {data_file}")
            save_json(record, data_file)

    else:
        # Fixed log message: this branch saves, it does not load
        logger.debug(f"saving data to a single file {data_path}")
        save_json(records, data_path)

save_json(data, data_file) ¤

save data to json file

This Function Overwrites any Existing Content

Beware that all contents in the file will be overwritten if it exists.

Parameters:

Name Type Description Default
data Union[dict, list]

dictionary data to be saved

required
data_file Union[str, Path]

json file path

required
Source code in kirsche/utils/io.py
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
def save_json(data: Union[dict, list], data_file: Union[str, Path]) -> None:
    """save data to json file

    !!! warning "This Function Overwrites any Existing Content"
        Beware that all contents in the file will be overwritten if it exists.

    :param data: dictionary data to be saved
    :param data_file: json file path
    """
    target = Path(data_file) if isinstance(data_file, str) else data_file

    # Warn loudly before clobbering an existing file
    if target.exists():
        logger.warning(f"{target} exists! Will replace the content")

    logger.debug(f"saving data to {target}")
    with open(target, "w") as out:
        json.dump(data, out, indent=4)