Skip to content

csv_utils

Process CSV files and add more robustness to the data manipulation by extracting and using a data type stored in the header of the CSV for each column.

Functions:

Name Description
check_type

Check if the type of the given value is one of the given expected types.

csv_format_type

Convert a given value extracted from CSV into the given type.

csv_get_row_value

Extract the value of a specific column of the given row, and convert it to the type specified by the column header.

csv_read_attributes

Read all the values from a given CSV file and convert all the values according to the types specified by the header.

string_to_type

Return the actual Python type corresponding to the string.

check_type(value, column_types)

Check if the type of the given value is one of the given expected types.

Parameters:

Name Type Description Default
value typing.Any

Value to check.

required
column_types list[str]

List of allowed types, represented as strings.

required

Returns:

Type Description
bool

Whether the type of the value is one of the expected types.

Source code in python/src/data_pipeline/utils/csv_utils.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def check_type(value: Any, column_types: list[str]) -> bool:
    """
    Check if the type of the given value is one of the given expected types.

    A list type is written ``list<separator><item type>`` (for example
    ``list,int``): the value must then be a list whose items all match the
    item type. An empty list matches any list type.

    Note that `bool` is a subclass of `int` in Python, so a boolean value
    also satisfies the ``int`` type.

    Parameters
    ----------
    value : Any
        Value to check.
    column_types : list[str]
        List of allowed types, represented as strings.

    Returns
    -------
    bool
        Whether the type of the value is one of the expected types.

    Raises
    ------
    NotImplementedError
        If an unsupported type string is encountered before a match is found.
    """
    for column_type in column_types:
        real_type = string_to_type(column_type)

        if real_type is not list:
            if isinstance(value, real_type):
                return True
            continue

        if not isinstance(value, list):
            continue

        # Skip the "list" prefix and the one-character separator that follows
        # it; what remains is the type every item has to match.
        item_type = column_type[len("list") + 1 :]
        # Generator expression: stop at the first item that does not match.
        if all(check_type(item, [item_type]) for item in value):
            return True

    return False

csv_format_type(value, column_type)

Convert a given value extracted from CSV into the given type.

Parameters:

Name Type Description Default
value str

Initial value.

required
column_type str

Type to transform it into.

required

Returns:

Type Description
typing.Any

The converted value.

Raises:

Type Description
NotImplementedError

If the given type is unsupported.

Source code in python/src/data_pipeline/utils/csv_utils.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def csv_format_type(value: str, column_type: str) -> Any:
    """
    Convert a given value extracted from CSV into the given type.

    Supported types are ``str``, ``float``, ``int``, ``bool`` and lists
    written as ``list<separator><item type>`` (for example ``list|int``),
    where the separator is a single character.

    Parameters
    ----------
    value : str
        Initial value.
    column_type : str
        Type to transform it into.

    Returns
    -------
    Any
        The converted value.
        An empty string gives None for ``float``, ``int`` and ``bool``,
        and an empty list for list types.

    Raises
    ------
    NotImplementedError
        If the given type is unsupported, including a list type that does not
        specify its separator.
    ValueError
        If the value cannot be parsed as the requested ``int`` or ``float``.
    """
    if column_type == "str":
        return str(value)

    if column_type in ("float", "int", "bool"):
        # An empty cell means "no value" for the scalar non-string types.
        if value == "":
            return None
        if column_type == "float":
            # Accept a decimal comma as well as a decimal point.
            return float(value.replace(",", "."))
        if column_type == "int":
            return int(value)
        # Only the word "true" (in any case) is True; everything else is False.
        return value.lower() == "true"

    if column_type.startswith("list"):
        if value == "":
            return []
        list_info = column_type[len("list") :]
        if not list_info:
            # A bare "list" has no separator to split on: report it as an
            # unsupported type instead of failing with an IndexError.
            raise NotImplementedError(
                f"Support for type '{column_type}' is not implemented yet."
            )
        separator, item_type = list_info[0], list_info[1:]
        return [
            csv_format_type(value=item, column_type=item_type)
            for item in value.split(separator)
        ]

    raise NotImplementedError(
        f"Support for type '{column_type}' is not implemented yet."
    )

csv_get_row_value(row, column)

Extract the value of a specific column of the given row, and convert it to the type specified by the column header.

Parameters:

Name Type Description Default
row dict[str, str]

The row as a mapping from the column name to the value.

required
column str

The column to extract.

required

Returns:

Type Description
str

The actual name of the column without the type.

typing.Any

The value of the column.

Raises:

Type Description
RuntimeError

If the column header is not formatted properly.

RuntimeError

If the type of the output is not correct.

Source code in python/src/data_pipeline/utils/csv_utils.py
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
def csv_get_row_value(row: dict[str, str], column: str) -> tuple[str, Any]:
    """
    Extract the value of a specific column of the given row, and convert it to the type specified by the column header.

    The column header must look like ``<Name> [<type>]``.

    Parameters
    ----------
    row : dict[str, str]
        The row as a mapping from the column name (including the type) to the value.
    column : str
        The column to extract.

    Returns
    -------
    str
        The actual name of the column without the type.
    Any
        The value of the column.

    Raises
    ------
    RuntimeError
        If the column header is not formatted properly.
    RuntimeError
        If the type of the output is not correct.
    KeyError
        If the column is not a key of the row.
    """
    column_split = column.split(" [")
    # The header needs exactly one " [" and a closing "]"; without the second
    # check the last character of the type would be dropped silently.
    if len(column_split) != 2 or not column_split[1].endswith("]"):
        raise RuntimeError(
            f"The column name should look like this: '<Name> [<type>]', but it is '{column}'."
        )
    column_name, bracketed_type = column_split
    column_type = bracketed_type[:-1]

    value = row[column]
    # Only raw strings are converted; any other value is just type-checked.
    if isinstance(value, str):
        value = csv_format_type(value=value.strip(), column_type=column_type)

    # None is accepted for every type.
    if not (check_type(value=value, column_types=[column_type]) or value is None):
        raise RuntimeError(
            f"The column '{column}' gave a value of type {type(value)} ({value})."
        )

    return column_name, value

csv_read_attributes(csv_path, specific_columns=())

Read all the values from a given CSV file and convert all the values according to the types specified by the header. Some specific columns that have to be in the file can be specified.

Parameters:

Name Type Description Default
csv_path pathlib.Path

The path to the CSV file.

required
specific_columns tuple[str, ...]

The mandatory columns. By default ().

()

Returns:

Name Type Description
attributes_all list[dict[str, typing.Any]]

All the values of all the rows, except the ones given in specific_columns. Stored as dictionaries mapping the actual column name (without the type) to the value converted to the right type.

specific_values_all list[tuple[tuple[str, typing.Any], ...]]

The values of the columns specified in specific_columns, in the same order. Each element of the list corresponds to one row, in the same order as in attributes_all.

Raises:

Type Description
RuntimeError

If two columns end up with the same name after removing the type.

Source code in python/src/data_pipeline/utils/csv_utils.py
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
def csv_read_attributes(
    csv_path: Path, specific_columns: tuple[str, ...] = ()
) -> tuple[list[dict[str, Any]], list[tuple[tuple[str, Any], ...]]]:
    """
    Read all the values from a given CSV file and convert all the values according to the types specified by the header.
    Some specific columns that have to be in the file can be specified.

    The file is read as UTF-8 (with or without a byte order mark), using ``;`` as delimiter.
    Rows in which every cell is empty are skipped.
    Columns whose header does not contain a type (``<Name> [<type>]``) are ignored,
    as are surplus fields of rows that are longer than the header.

    Parameters
    ----------
    csv_path : Path
        The path to the CSV file.
    specific_columns : tuple[str, ...], optional
        The mandatory columns, given as full headers (including the type).
        By default ().

    Returns
    -------
    attributes_all: list[dict[str, Any]]
        All the values of all the rows, except the ones given in `specific_columns`.
        Stored as dictionaries mapping the actual column name (without the type) to the value converted to the right type.
    specific_values_all: list[tuple[tuple[str, Any], ...]]
        The values of the columns specified in `specific_columns`, in the same order.
        Each element of the list corresponds to one row, in the same order as in `attributes_all`.

    Raises
    ------
    RuntimeError
        If two columns end up with the same name after removing the type.
    KeyError
        If one of the `specific_columns` is not a column of the file.
    """
    attributes_all: list[dict[str, Any]] = []
    specific_values_all: list[tuple[tuple[str, Any], ...]] = []

    # newline="" is what the csv module expects: it then handles the line
    # endings itself, including newlines embedded in quoted fields.
    with open(csv_path, encoding="utf-8-sig", newline="") as csvfile:
        reader = csv.DictReader(csvfile, delimiter=";")
        for row in reader:
            # Skip empty rows
            if all(cell == "" for cell in row.values()):
                continue

            # Process the specific columns, removing them from the row so that
            # they are not loaded a second time as regular attributes.
            specific_values: list[tuple[str, Any]] = []
            for specific_column in specific_columns:
                specific_values.append(
                    csv_get_row_value(row=row, column=specific_column)
                )
                row.pop(specific_column)
            specific_values_all.append(tuple(specific_values))

            # Load as attributes the columns that contain a type
            attributes: dict[str, Any] = {}
            for col_name_type in row:
                # DictReader stores the surplus fields of an over-long row
                # under the key None: there is no header to read a type from.
                if not isinstance(col_name_type, str):
                    continue
                # Skip columns that don't have a type
                if " [" not in col_name_type:
                    continue
                # Add the column and its value to the attributes
                col_name, col_value = csv_get_row_value(row=row, column=col_name_type)
                if col_name in attributes:
                    raise RuntimeError(
                        f"Two columns have the same name '{col_name}' in {str(csv_path)}"
                    )
                attributes[col_name] = col_value
            attributes_all.append(attributes)

    return attributes_all, specific_values_all

string_to_type(type_string)

Return the actual Python type corresponding to the string.

Parameters:

Name Type Description Default
type_string str

The type represented as a string.

required

Returns:

Type Description
type

The actual type.

Raises:

Type Description
NotImplementedError

If the given type is unsupported.

Source code in python/src/data_pipeline/utils/csv_utils.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
def string_to_type(type_string: str) -> type:
    """
    Map the textual name of a type to the matching Python type.

    Parameters
    ----------
    type_string : str
        Name of the type, e.g. ``"int"`` or ``"list,int"``.

    Returns
    -------
    type
        `str`, `float`, `int` or `bool` for the names of the same spelling,
        and `list` for any name starting with ``"list"``.

    Raises
    ------
    NotImplementedError
        If the given type is unsupported.
    """
    scalar_types: dict[str, type] = {
        "str": str,
        "float": float,
        "int": int,
        "bool": bool,
    }

    if type_string in scalar_types:
        return scalar_types[type_string]

    # List types carry their separator and item type after the "list" prefix,
    # so only the prefix is looked at here.
    if type_string.startswith("list"):
        return list

    raise NotImplementedError(
        f"Support for type '{type_string}' is not implemented yet."
    )