generated from daniil-berg/boilerplate-py
331 lines
15 KiB
Python
331 lines
15 KiB
Python
import os
|
|
import shutil
|
|
from pathlib import Path
|
|
from typing import Union, Mapping, Sequence, Iterable, Callable
|
|
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
import tensorflow as tf
|
|
from tensorflow.keras import layers
|
|
|
|
from .config import CONFIG
|
|
from .types import PathT
|
|
|
|
|
|
def label_files(src_dir: PathT, dest_dir: PathT, labels: Union[PathT, Sequence[str]],
                reverse: bool = False, extensions: Iterable[str] = None) -> None:
    """
    Copies files giving them new names by using specified labels.

    All matching files are sorted by their file name before applying the sequence of labels to them.
    The first file is named with the first label, the second is named with the second label, and so on.
    If a label duplicate is encountered, a dot followed by a counter is appended to the file name
    *preceding* the file extension, e.g. if 'some_label.jpg' exists, 'some_label.1.jpg' may be created.

    The number of matching files must be greater than or equal to the number of labels.
    Exactly one file is copied for every label; thus, after every label has been used, the operation ends.

    Args:
        src_dir:
            Path to directory containing the files to be copied/renamed
        dest_dir:
            Path to destination directory
        labels:
            Either a sequence of labels (strings) or a path to a file containing the labels (newline separated)
        reverse (optional):
            Defines which file receives which label;
            if False (default), the files in `src_dir` are sorted ascending by their file name,
            if True, the files are sorted descending by name.
        extensions (optional):
            Iterable of file extensions; only files with these extensions will be considered.
            A single string is treated as one extension.

    Raises:
        NotADirectoryError:
            If `dest_dir` is not an existing directory
        IndexError:
            If there are more labels than matching files in `src_dir`
    """
    if extensions is None:
        suffixes = ''  # every file name ends with the empty string, i.e. no filtering
    elif isinstance(extensions, str):
        suffixes = (extensions,)  # do not explode a lone extension into its characters
    else:
        suffixes = tuple(extensions)
    # Sub-directories are skipped, because they cannot be copied with `shutil.copyfile`.
    file_names = sorted(
        (name for name in os.listdir(src_dir)
         if name.endswith(suffixes) and os.path.isfile(os.path.join(src_dir, name))),
        reverse=reverse,
    )
    try:
        with open(labels, 'r') as f:
            content = f.read().strip()
    except TypeError:
        pass  # Assume, labels is already a sequence of strings
    else:
        # An empty labels file means "no labels" rather than one empty label.
        labels = content.split('\n') if content else []
    if not os.path.isdir(dest_dir):
        raise NotADirectoryError(f"'{dest_dir}' is not a directory.")
    if len(labels) > len(file_names):
        raise IndexError(f"There are more labels ({len(labels)}) than files "
                         f"in the source directory ({len(file_names)} matching).")
    for file_name, label in zip(file_names, labels):
        _, ext = os.path.splitext(file_name)
        new_path = os.path.join(dest_dir, label + ext)
        # The counter is kept separately from the label, so that a label which itself ends
        # in '.<digits>' is never mistaken for an already numbered duplicate.
        counter = 0
        while os.path.exists(new_path):
            counter += 1
            new_path = os.path.join(dest_dir, f"{label}.{counter}{ext}")
        shutil.copyfile(os.path.join(src_dir, file_name), new_path)
|
|
|
|
|
|
def load_images_data(data_dir: PathT, extensions: Iterable[str] = ('.jpg', '.png'), verbose: bool = True
                     ) -> tuple[dict[str, str], str]:
    """
    Creates a dictionary mapping file paths (of images) to their labels.
    Everything up to the first dot in the filename is taken to be the label;
    this naturally excludes file extensions, but also a filename like `ABC.1.jpg` results in the label `ABC`.
    Also creates a vocabulary of characters encountered in the labels.

    Files are visited in sorted order and the vocabulary is returned sorted,
    so that the result is the same on every run for the same directory contents.

    Args:
        data_dir:
            Path-like object or string to a directory containing the desired image files
        extensions (optional):
            Iterable of extensions that the files considered for the resulting data should be restricted to;
            defaults to restricting finds to JPEG and PNG files.
            A single string is treated as one extension.
        verbose (optional):
            If True, the function will print out a summary of the findings before returning.

    Returns:
        2-tuple with the first element being a dictionary where the keys are the file paths and the values are the
        image labels and the second element being a string of all characters present in the labels (sorted).
    """
    data_dir = Path(data_dir)
    # Materialize once: a one-shot iterable would be used up by the very first membership test.
    allowed = {extensions} if isinstance(extensions, str) else set(extensions)
    file_paths_and_labels, characters = {}, set()
    for file_path in sorted(data_dir.iterdir()):
        # Directories that merely look like images (e.g. 'foo.png/') are not data samples.
        if file_path.suffix not in allowed or not file_path.is_file():
            continue
        label = file_path.name.split('.')[0]
        characters.update(label)
        file_paths_and_labels[str(file_path)] = label
    # Set iteration order is not stable between interpreter runs; sorting fixes the vocabulary order.
    vocabulary = ''.join(sorted(characters))
    if verbose:
        print("Number of images/labels found: ", len(file_paths_and_labels))
        print("Number of unique characters: ", len(characters))
        print("Characters present: ", characters)
    return file_paths_and_labels, vocabulary
|
|
|
|
|
|
def get_vocab_maps(characters: Iterable[str], num_oov_indices: int = 0, mask_token: str = None
                   ) -> tuple[layers.StringLookup, layers.StringLookup]:
    """
    Constructs two table-based lookup objects that map characters to integers and back.

    Details about the `StringLookup` class in the documentation:
    https://www.tensorflow.org/api_docs/python/tf/keras/layers/StringLookup

    Args:
        characters:
            An iterable of strings representing the vocabulary to be mapped
        num_oov_indices (optional):
            Passed to the `StringLookup` constructor of both lookup objects;
            defines the number of out-of-vocabulary (OOV) tokens to create;
            assuming that no OOV characters will be encountered, the default is 0.
        mask_token (optional):
            Passed to the `StringLookup` constructor of both lookup objects;
            the token representing missing values;
            assuming that there will never be a value missing, the default is None.

    Returns:
        2-tuple of `StringLookup` objects, the first one mapping characters to integers, the second doing the reverse.
        By default, no OOV or missing values are assumed to be encountered,
        and thus each index (uniquely) represents a character from the vocabulary.
    """
    char_to_int = layers.StringLookup(
        vocabulary=list(characters),
        num_oov_indices=num_oov_indices,
        mask_token=mask_token,
    )
    int_to_char = layers.StringLookup(
        vocabulary=char_to_int.get_vocabulary(),
        # Must match the forward table: left at the Keras default (1), the inverse table would
        # reserve index 0 for an OOV token and every decoded index would be off by one
        # whenever the forward table was built with a different number of OOV indices.
        num_oov_indices=num_oov_indices,
        mask_token=mask_token,
        invert=True,
    )
    return char_to_int, int_to_char
|
|
|
|
|
|
def encode_image(img):
    """
    Creates a `Tensor` object from an image file and transposes it.

    Args:
        img:
            Location of the image file; either a string, a path-like object or a string tensor
            holding the path (the latter is what the `map(...)` method of a `Dataset` passes in).

    Returns:
        A float32 tensor with a single (grayscale) channel, resized to the configured image height and width
        and transposed so that the first dimension corresponds to the width of the image.
    """
    # Only genuine path objects are converted to a string; calling `str()` on a tensor would yield
    # its printable representation instead of the path it holds, and the wrong file would be requested.
    if isinstance(img, os.PathLike):
        img = os.fspath(img)
    try:
        # 0. Read image
        img = tf.io.read_file(img)
    except ValueError:
        # NOTE(review): kept from the original; presumably allows passing in something that is not
        # a readable path and handing it to the decoder as is -- confirm the intended use case.
        pass
    # 1. Decode and convert to grayscale
    img = tf.io.decode_png(img, channels=1)
    # 2. Convert to float32 in [0, 1] range
    img = tf.image.convert_image_dtype(img, tf.float32)
    # 3. Resize to the desired size
    img = tf.image.resize(img, [CONFIG.IMG_HEIGHT, CONFIG.IMG_WIDTH])
    # 4. Transpose the image because we want the time
    # dimension to correspond to the width of the image.
    return tf.transpose(img, perm=[1, 0, 2])
|
|
|
|
|
|
def encode_label(label: str, forward_lookup_table: layers.StringLookup):
    """
    Splits a label into its individual characters and passes them through a `StringLookup` instance.

    Args:
        label:
            The label string to be encoded (split up as UTF-8)
        forward_lookup_table:
            Lookup object that is called with the tensor of single characters

    Returns:
        Whatever calling `forward_lookup_table` on the tensor of characters returns.
    """
    single_characters = tf.strings.unicode_split(label, input_encoding='UTF-8')
    return forward_lookup_table(single_characters)
|
|
|
|
|
|
def decode_label(tensor, backward_lookup_table: layers.StringLookup) -> str:
    """
    Turns an encoded label back into a regular string.

    Args:
        tensor:
            The encoded label that is passed through the lookup object
        backward_lookup_table:
            Lookup object that is called with `tensor`; its output is joined into one string

    Returns:
        The joined lookup results, decoded as UTF-8.
    """
    looked_up = backward_lookup_table(tensor)
    joined = tf.strings.reduce_join(looked_up)
    raw_bytes = joined.numpy()
    return raw_bytes.decode('utf-8')
|
|
|
|
|
|
def get_sample_encoder(forward_lookup_table: layers.StringLookup) -> Callable[[str, str], dict]:
    """
    Builds the function that encodes one data sample, for usage in the `map(...)` method of a `Dataset` instance.

    The returned function takes an image path and a label and produces a dictionary with two entries:
    the tensor representing the image and the tensor representing the label.
    The keys of both entries are taken from the configuration (the names of the model's input layers).

    Args:
        forward_lookup_table:
            Handed on to the `encode_label` function, which uses it to encode the characters of a label.

    Returns:
        Callable taking two strings as arguments and returning a dictionary with string keys and Tensor values.
    """
    def func(img_path: PathT, label: str) -> dict:
        image_tensor = encode_image(img_path)
        label_tensor = encode_label(label, forward_lookup_table)
        return {
            CONFIG.INPUT_LAYER_NAME_IMAGE: image_tensor,
            CONFIG.INPUT_LAYER_NAME_LABEL: label_tensor,
        }

    return func
|
|
|
|
|
|
def make_dataset(file_paths: np.ndarray, labels: np.ndarray,
                 sample_encode_func: Callable[[str, str], dict], batch_size: int) -> tf.data.Dataset:
    """
    Builds a `Dataset` instance out of an array of image file paths and an array of their labels.

    Args:
        file_paths:
            Array of strings, each being the path to an image file;
            every path ends up as the first argument of the sample encoding function.
        labels:
            Array of strings, each being the label of the image that the path
            at the same index of `file_paths` points to;
            every label ends up as the second argument of the sample encoding function.
        sample_encode_func:
            Used as the `map_func` argument of the `map(...)` method of the new `Dataset`;
            expected to take two strings (image path and label) and
            to return a dictionary of Tensors representing the image and label.
        batch_size:
            Used as the `batch_size` argument of the `batch(...)` method of the new `Dataset`.

    Returns:
        The mapped, batched and prefetching `Dataset`.

    Raises:
        ValueError:
            If `file_paths` and `labels` do not hold the same number of elements
    """
    if file_paths.size != labels.size:
        raise ValueError("Number of file paths must be equal to number of labels")

    autotune = tf.data.experimental.AUTOTUNE
    raw_samples = tf.data.Dataset.from_tensor_slices((file_paths, labels))
    encoded_samples = raw_samples.map(map_func=sample_encode_func, num_parallel_calls=autotune)
    batches = encoded_samples.batch(batch_size=batch_size)
    return batches.prefetch(buffer_size=autotune)
|
|
|
|
|
|
def get_datasets(file_paths_and_labels: Mapping[str, str], sample_encode_func: Callable[[str, str], dict],
                 batch_size: int, train_data_ratio: float, shuffle: bool = CONFIG.SHUFFLE_DATA
                 ) -> tuple[tf.data.Dataset, tf.data.Dataset]:
    """
    Splits a mapping of image file paths to labels into a training dataset and a validation dataset.

    Args:
        file_paths_and_labels:
            Mapping of image file paths (keys) to the labels of those images (values);
            this is the full dataset that gets split.
        sample_encode_func:
            Handed on as the `sample_encode_func` argument of the `make_dataset(...)` function;
            expected to take two strings (image path and label) and
            to return a dictionary of Tensors representing the image and label.
        batch_size:
            Handed on as the `batch_size` argument of the `make_dataset(...)` function.
        train_data_ratio:
            Share of the full dataset that goes into the training dataset;
            the number of training samples is `int(size * train_data_ratio)`, the rest is used for validation.
        shuffle:
            If True, the sample order is shuffled pseudo-randomly before the split.

    Returns:
        2-tuple of `Dataset` objects, the first one for training, the second one for validation.
    """
    # Paths and labels as parallel arrays (same index, same sample)
    all_paths = np.array(list(file_paths_and_labels.keys()))
    all_labels = np.array(list(file_paths_and_labels.values()))

    # The order in which the samples are distributed over both sets
    total = len(file_paths_and_labels)
    order = np.arange(total)
    if shuffle:
        np.random.shuffle(order)

    # Everything before the split position is training data, the remainder is validation data
    split_at = int(total * train_data_ratio)
    train_part = order[:split_at]
    valid_part = order[split_at:]

    training = make_dataset(all_paths[train_part], all_labels[train_part], sample_encode_func, batch_size)
    validation = make_dataset(all_paths[valid_part], all_labels[valid_part], sample_encode_func, batch_size)
    return training, validation
|
|
|
|
|
|
def plot_images(images: Sequence[np.ndarray], labels: Sequence[str] = None, num_columns: int = 4,
                transpose: bool = True) -> None:
    """
    Shows a batch of single-channel images in a grid, optionally with a title above each image.

    Args:
        images:
            The images as a 4-dimensional batch; only the first channel (last dimension) is displayed
            and the values are multiplied by 255 before being converted to `uint8`.
        labels (optional):
            If provided, the label at the same index as an image is used as the title of that image.
        num_columns (optional):
            Number of images per row of the grid; the number of rows is derived from the number of images.
        transpose (optional):
            If True (default), the second and third dimension of the batch are swapped before plotting.
    """
    # Works for NumPy arrays and eager tensors alike, independent of `transpose`.
    images = np.asarray(images)
    if transpose:
        images = images.transpose(0, 2, 1, 3)
    images = (images[:, :, :, 0] * 255).astype('uint8')
    # Round up, so that a partially filled last row still gets its axes.
    num_rows = max(1, -(-len(images) // num_columns))
    # `squeeze=False` always yields a 2-dimensional array of axes, whatever the grid shape.
    _, axs = plt.subplots(num_rows, num_columns, figsize=(10, 5), squeeze=False)
    for idx, ax in enumerate(axs.flat):
        ax.axis('off')
        if idx >= len(images):
            continue  # unused cell of the grid; leave it blank
        ax.imshow(images[idx], cmap='gray')
        if labels is not None:
            ax.set_title(labels[idx])
    plt.show()
|
|
|
|
|
|
class DatasetsInterface:
    """
    Bundles the steps needed to load and pre-process the training and validation data for a model.

    On construction, the image data is loaded from a directory and the lookup tables as well as the sample
    encoding function are derived from it; the two datasets are only created by `split_and_make_datasets`.
    """

    def __init__(self, batch_size: int, data_dir: PathT,
                 extensions: Iterable[str] = CONFIG.DEFAULT_DATA_FILE_EXTENSIONS) -> None:
        """
        Loads the image data and prepares everything required for encoding samples.

        Args:
            batch_size:
                Stored and later passed on to `get_datasets(...)`
            data_dir:
                Passed on to `load_images_data(...)` as the directory to load the data from
            extensions (optional):
                Passed on to `load_images_data(...)`; defaults to the configured data file extensions.
        """
        loaded = load_images_data(data_dir=data_dir, extensions=extensions)
        self.file_paths_and_labels, self.characters = loaded
        lookup_tables = get_vocab_maps(characters=self.characters)
        self.forward_lookup_table, self.backward_lookup_table = lookup_tables
        self.sample_encode_func = get_sample_encoder(forward_lookup_table=self.forward_lookup_table)
        self.batch_size = batch_size
        # Both datasets stay unset until `split_and_make_datasets` is called
        self.training = None
        self.validation = None

    def split_and_make_datasets(self, train_data_ratio: float = (1 - CONFIG.VALIDATION_DATA_RATIO),
                                shuffle: bool = CONFIG.SHUFFLE_DATA) -> None:
        """
        Creates the training and validation datasets and stores them in `training` and `validation`.

        Args:
            train_data_ratio (optional):
                Passed on to `get_datasets(...)`; defaults to 1 minus the configured validation data ratio.
            shuffle (optional):
                Passed on to `get_datasets(...)`; defaults to the configured value.
        """
        datasets = get_datasets(
            file_paths_and_labels=self.file_paths_and_labels,
            sample_encode_func=self.sample_encode_func,
            batch_size=self.batch_size,
            train_data_ratio=train_data_ratio,
            shuffle=shuffle,
        )
        self.training, self.validation = datasets
|