This commit is contained in:
Daniil Fajnberg 2022-05-08 20:53:05 +02:00
parent 112a6c57ff
commit be3aebaf07
9 changed files with 713 additions and 3 deletions

View File

@ -0,0 +1,3 @@
tensorflow
numpy
matplotlib

View File

@ -1,8 +1,8 @@
[metadata]
name = ccaptchas
version = 0.0.1
author = Daniil
author_email = mail@placeholder123.to
author = Daniil Fajnberg
author_email = mail@daniil.fajnberg.de
description = Character CAPTCHA Solver
long_description = file: README.md
long_description_content_type = text/markdown
@ -19,7 +19,8 @@ package_dir =
packages = find:
python_requires = >=3
install_requires =
...
numpy
matplotlib
[options.extras_require]
dev =

110
src/ccaptchas/__main__.py Normal file
View File

@ -0,0 +1,110 @@
from argparse import ArgumentParser
from pathlib import Path
from typing import Any, Sequence
from .config import CONFIG
# Names used both for building the command line interface in `parse_cli`
# and as keys of the dictionary it returns.
# `main` pops the `CMD` key and forwards the remaining items as keyword arguments
# to the selected `start` function, so the option names must match its parameters.
CMD = 'command'  # `dest` under which the chosen sub-command is stored
TRAIN = 'train'  # name of the training sub-command
DATA_DIR = 'data_dir'
SAVE_DIR = 'save_dir'
FILE_EXTENSIONS = 'file_extensions'
BATCH_SIZE = 'batch_size'
NUM_EPOCHS = 'num_epochs'
EARLY_STOPPING_PATIENCE = 'early_stopping_patience'
INFER = 'infer'  # name of the inference sub-command
MODEL_DIR = 'model_dir'
def ext_list(string: str) -> list[str]:
    """
    Splits a comma-separated string of file extensions into a list.

    Surrounding whitespace is removed from every item.

    Raises:
        ValueError: if any item does not begin with a dot.
    """
    extensions = [part.strip() for part in string.split(',')]
    if any(not extension.startswith('.') for extension in extensions):
        raise ValueError("Extensions must start with a dot")
    return extensions
def parse_cli(args: Sequence[str] = None) -> dict[str, Any]:
    """
    Builds the command line parser and parses the provided arguments.

    The parser has one global option (file extensions) and two sub-commands:
    one for training a new model and one for running inference with a saved model.

    Args:
        args (optional):
            Sequence of argument strings to parse; passed on to `ArgumentParser.parse_args`.

    Returns:
        Dictionary of the parsed arguments; the chosen sub-command is stored under the `CMD` key
        (it is None if no sub-command was given), all other keys are the option/argument names.
    """
    parser = ArgumentParser(
        prog=CONFIG.PROGRAM_NAME,
        description="Character CAPTCHA Solver",
    )
    # The separating space before "Defaults to" was missing in the concatenated help text.
    parser.add_argument(
        '-E', f'--{FILE_EXTENSIONS.replace("_", "-")}',
        default=CONFIG.DEFAULT_DATA_FILE_EXTENSIONS,
        type=ext_list,
        help=f"When used in `{TRAIN}` mode, extensions of the image files to be used for training/testing the model. "
             f"When used in `{INFER}` mode, extensions of the image files to use the model on. "
             f"Defaults to {CONFIG.DEFAULT_DATA_FILE_EXTENSIONS}."
    )
    subparsers = parser.add_subparsers(dest=CMD)

    # Sub-command: training
    parser_train = subparsers.add_parser(TRAIN, help="trains a new model")
    parser_train.add_argument(
        DATA_DIR,
        type=Path,
        help="Directory containing the image files to be used for training/testing the model."
    )
    parser_train.add_argument(
        '-s', f'--{SAVE_DIR.replace("_", "-")}',
        default=CONFIG.DEFAULT_SAVE_DIR,
        type=Path,
        help=f"Directory in which to save trained models. A subdirectory for each training session named with the "
             f"current date and time will be created there and the model will be saved in that subdirectory. "
             f"Defaults to '{CONFIG.DEFAULT_SAVE_DIR}'."
    )
    parser_train.add_argument(
        '-b', f'--{BATCH_SIZE.replace("_", "-")}',
        default=CONFIG.DEFAULT_BATCH_SIZE,
        type=int,
        help=f"The dataset will be divided into batches; this determines the number of images in each batch. "
             f"Defaults to {CONFIG.DEFAULT_BATCH_SIZE}."
    )
    parser_train.add_argument(
        '-n', f'--{NUM_EPOCHS.replace("_", "-")}',
        default=CONFIG.DEFAULT_NUM_EPOCHS,
        type=int,
        help=f"The number of training epochs. Defaults to {CONFIG.DEFAULT_NUM_EPOCHS}."
    )
    parser_train.add_argument(
        '-p', f'--{EARLY_STOPPING_PATIENCE.replace("_", "-")}',
        default=CONFIG.DEFAULT_EARLY_STOPPING_PATIENCE,
        type=int,
        help=f"The number of training epochs with no improvement over a previously achieved optimum to allow before "
             f"stopping training early (i.e. without completing all epochs). "
             f"Defaults to {CONFIG.DEFAULT_EARLY_STOPPING_PATIENCE}."
    )

    # Sub-command: inference
    parser_infer = subparsers.add_parser(INFER, help="uses an existing model to make inferences")
    parser_infer.add_argument(
        MODEL_DIR,
        type=Path,
        help="Directory containing the model to use for inference."
    )
    parser_infer.add_argument(
        DATA_DIR,
        type=Path,
        help="Directory containing the image files to use the model on."
    )
    return vars(parser.parse_args(args))
def main() -> None:
    """
    Entry point of the program: parses the command line and dispatches to the chosen sub-command.

    The heavy modules are imported only after the sub-command is known,
    so that only the one actually needed gets loaded.

    Raises:
        NotImplementedError: if no (or an unknown) sub-command was specified.
    """
    kwargs = parse_cli()
    cmd = kwargs.pop(CMD)
    if cmd == TRAIN:
        from .model import start
    elif cmd == INFER:
        from .infer import start
    else:
        # `NotImplemented` is a comparison sentinel, not an exception; raising it produces a `TypeError`.
        raise NotImplementedError(f"No implementation for command: {cmd!r}")
    start(**kwargs)


if __name__ == '__main__':
    main()

20
src/ccaptchas/config.py Normal file
View File

@ -0,0 +1,20 @@
from pathlib import Path
class CONFIG:
    """
    Static namespace holding the program-wide settings; not meant to be instantiated with state.
    """
    __slots__ = ()

    PROGRAM_NAME = 'ccaptchas'

    # Defaults for the command line options
    DEFAULT_SAVE_DIR = Path('.', 'saved_models')
    DEFAULT_DATA_FILE_EXTENSIONS = ('.jpg', '.png')
    DEFAULT_BATCH_SIZE = 10
    DEFAULT_NUM_EPOCHS = 100
    DEFAULT_EARLY_STOPPING_PATIENCE = 10

    # Dataset handling
    VALIDATION_DATA_RATIO = 1 / 8
    SHUFFLE_DATA = True

    # Names of the model layers
    INPUT_LAYER_NAME_IMAGE = 'image'
    INPUT_LAYER_NAME_LABEL = 'label'
    OUTPUT_LAYER_NAME = 'encoded_output'

    # Maximum number of characters in any captcha image in the dataset
    MAX_STRING_LENGTH = 6

    # Desired image dimensions
    IMG_WIDTH = 250
    IMG_HEIGHT = 50

    VOCABULARY_FILE_NAME = '.vocabulary'

63
src/ccaptchas/infer.py Normal file
View File

@ -0,0 +1,63 @@
from pathlib import Path
from typing import Iterable
import numpy as np
import tensorflow as tf
from keras.api._v2.keras.models import Model, load_model
from keras.api._v2.keras.layers.experimental.preprocessing import StringLookup
from keras.api._v2.keras.backend import ctc_decode
from .config import CONFIG
from .preproc import encode_image, decode_label, plot_images
from .types import PathT
def images_to_input(*images) -> tf.data.Dataset:
    """
    Encodes every given image with `encode_image` and wraps the stacked results in a `Dataset`.
    """
    encoded_images = list(map(encode_image, images))
    stacked = np.array(encoded_images)
    return tf.data.Dataset.from_tensor_slices(stacked)
def decode_batch_outputs(predictions, backward_lookup_table: StringLookup) -> list[str]:
    """
    Turns a batch of raw model predictions into a list of label strings.

    The predictions are CTC-decoded with greedy search, cut off at the configured maximum
    string length, and then mapped back to characters via the `backward_lookup_table`.
    """
    batch_size, sequence_length = predictions.shape[0], predictions.shape[1]
    # Every sample in the batch spans the full sequence length.
    input_len = np.ones(batch_size) * sequence_length
    # Greedy search is used here; beam search would be an option for more complex tasks.
    decoded, _ = ctc_decode(predictions, input_length=input_len, greedy=True)
    truncated = decoded[0][:, :CONFIG.MAX_STRING_LENGTH]
    labels = []
    for encoded_label in truncated:
        labels.append(decode_label(encoded_label, backward_lookup_table))
    return labels
def load_inference_model(path: PathT) -> Model:
    """
    Loads a saved model together with its vocabulary and prepares it for inference.

    A new model is constructed from the saved one that reaches from the image input layer
    to the encoded output layer only. For convenience, two attributes are attached to it:
    `backward_lookup_table` (mapping indices to characters) and
    `infer_and_decode` (a function taking a dataset and returning the decoded label strings).

    Args:
        path:
            Directory containing the saved model and the vocabulary file

    Returns:
        The inference model with the additional attributes mentioned above.
    """
    with open(Path(path, CONFIG.VOCABULARY_FILE_NAME), 'r') as vocab_file:
        vocabulary = list(vocab_file.read())
    # The vocabulary file is written from a forward lookup table created without OOV indices,
    # i.e. the first character has index 0. The inverse table must use the same offset;
    # with the default of one OOV index every decoded character would be shifted by one position.
    backward_lookup_table = StringLookup(
        vocabulary=vocabulary,
        num_oov_indices=0,
        mask_token=None,
        invert=True,
    )
    saved_model = load_model(path)
    inference_model = Model(
        saved_model.get_layer(name=CONFIG.INPUT_LAYER_NAME_IMAGE).input,
        saved_model.get_layer(name=CONFIG.OUTPUT_LAYER_NAME).output
    )

    def infer_and_decode(x: tf.data.Dataset) -> list[str]:
        return decode_batch_outputs(predictions=inference_model.predict(x), backward_lookup_table=backward_lookup_table)

    inference_model.infer_and_decode = infer_and_decode
    inference_model.backward_lookup_table = backward_lookup_table
    return inference_model
def start(model_dir: PathT, data_dir: PathT,
          file_extensions: Iterable[str] = CONFIG.DEFAULT_DATA_FILE_EXTENSIONS) -> None:
    """
    Runs a saved model on all matching image files in a directory and plots the images with the inferred labels.

    Args:
        model_dir:
            Directory containing the saved model (and its vocabulary file)
        data_dir:
            Directory containing the image files to use the model on
        file_extensions (optional):
            Only files in `data_dir` ending with one of these extensions are used.

    Raises:
        ValueError: if no matching files are found or if there are more than 24 of them.
    """
    max_files = 24  # NOTE(review): presumably chosen to keep the plot readable -- confirm
    data_dir = Path(data_dir)
    file_paths = []
    for ext in file_extensions:
        file_paths.extend(data_dir.glob(f'*{ext}'))
    file_paths.sort()
    count = len(file_paths)
    if count == 0:
        # Without this check the dataset would be batched with size 0 further down, which fails obscurely.
        raise ValueError(f"No matching files found in '{data_dir}'")
    if count > max_files:
        raise ValueError("Too many files")
    dataset = images_to_input(*file_paths)
    model = load_inference_model(model_dir)
    # All images are passed through the model as one single batch.
    labels = model.infer_and_decode(dataset.batch(count))
    plot_images(list(dataset.as_numpy_iterator()), labels=labels)

178
src/ccaptchas/model.py Normal file
View File

@ -0,0 +1,178 @@
import os
from datetime import datetime
from pathlib import Path
from typing import Iterable
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from .config import CONFIG
from .preproc import DatasetsInterface
from .types import PathT
THIS_DIR = os.path.dirname(os.path.realpath(__file__))
# Build paths relative to this file's directory like this: os.path.join(THIS_DIR, ...)
class CTCLayer(layers.Layer):
    """
    Layer that computes the CTC loss for a batch, registers it on itself and passes the predictions through.
    """

    def __init__(self, name: str = None):
        super().__init__(name=name)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true: np.ndarray = None, y_pred: np.ndarray = None) -> np.ndarray:
        """
        Adds the training-time loss via `self.add_loss()` and returns `y_pred` unchanged.
        """
        true_shape = tf.shape(y_true)
        pred_shape = tf.shape(y_pred)
        batch_len = tf.cast(true_shape[0], dtype='int64')
        # One row per sample; used to broadcast the scalar lengths over the batch.
        ones_column = tf.ones(shape=(batch_len, 1), dtype='int64')
        input_length = tf.cast(pred_shape[1], dtype='int64') * ones_column
        label_length = tf.cast(true_shape[1], dtype='int64') * ones_column
        self.add_loss(self.loss_fn(y_true, y_pred, input_length, label_length))
        # At test time, only the computed predictions are of interest.
        return y_pred
# Factor by which the image is going to be downsampled by the convolutional blocks.
# The model uses two convolution blocks, each of which has a pooling layer
# that downsamples the features by a factor of 2.
# Hence, the total downsampling factor is 4.
downsample_factor = 4
def build_model(alphabet_size: int,
                img_width: int = CONFIG.IMG_WIDTH,
                img_height: int = CONFIG.IMG_HEIGHT,
                optimizer: keras.optimizers.Optimizer = None) -> keras.models.Model:
    """
    Constructs and compiles the OCR model.

    The model consists of two convolution/pooling blocks, a dense layer with dropout,
    two bidirectional LSTM layers, a softmax output layer and a `CTCLayer` adding the loss.

    Args:
        alphabet_size:
            Number of distinct characters; the output layer gets one additional unit.
        img_width (optional):
            Width of the input images
        img_height (optional):
            Height of the input images
        optimizer (optional):
            Optimizer to compile the model with; if omitted, a new `Adam` instance is created for this model.

    Returns:
        The compiled model with two inputs (image and label) named in accordance with the configuration.
    """
    if optimizer is None:
        # Created per call; an instance in the signature would be built once at import time
        # and shared (including its state) between all models built with the default.
        optimizer = keras.optimizers.Adam()
    num_filters_conv1, num_filters_conv2 = 32, 64
    # Each of the two pooling layers halves both spatial dimensions.
    total_downsampling = 4

    # Inputs to the model
    input_img = layers.Input(
        shape=(img_width, img_height, 1),
        dtype='float32',
        name=CONFIG.INPUT_LAYER_NAME_IMAGE
    )
    labels = layers.Input(
        shape=(None, ),
        dtype='float32',
        name=CONFIG.INPUT_LAYER_NAME_LABEL,
    )

    # First conv block
    x = layers.Conv2D(
        filters=num_filters_conv1,
        kernel_size=(3, 3),
        activation='relu',
        kernel_initializer='he_normal',
        padding='same',
        name='conv1',
    )(input_img)
    x = layers.MaxPooling2D(
        pool_size=(2, 2),
        name='pool1'
    )(x)

    # Second conv block
    x = layers.Conv2D(
        filters=num_filters_conv2,
        kernel_size=(3, 3),
        activation='relu',
        kernel_initializer='he_normal',
        padding='same',
        name='conv2',
    )(x)
    x = layers.MaxPooling2D(
        pool_size=(2, 2),
        name='pool2'
    )(x)

    # The feature maps are now 4x smaller than the input and the last conv layer has 64 filters.
    # Reshape accordingly before passing the output to the RNN part of the model.
    new_shape = (
        (img_width // total_downsampling),
        (img_height // total_downsampling) * num_filters_conv2
    )
    x = layers.Reshape(
        target_shape=new_shape,
        name='reshape'
    )(x)
    x = layers.Dense(
        units=64,
        activation='relu',
        name='dense1'
    )(x)
    x = layers.Dropout(rate=0.2)(x)

    # RNNs
    x = layers.Bidirectional(
        layers.LSTM(
            units=128,
            return_sequences=True,
            dropout=0.25,
        )
    )(x)
    x = layers.Bidirectional(
        layers.LSTM(
            units=64,
            return_sequences=True,
            dropout=0.25,
        )
    )(x)

    # Output layer
    x = layers.Dense(
        units=alphabet_size + 1,
        activation='softmax',
        name=CONFIG.OUTPUT_LAYER_NAME,
    )(x)

    # Add CTC layer for calculating CTC loss at each step
    output = CTCLayer(name='ctc_loss')(labels, x)

    # Define the model
    model = keras.models.Model(
        inputs=[input_img, labels],
        outputs=output,
        name='ocr_model_v1'
    )
    # The loss is added by the CTC layer, so only the optimizer is needed for compiling.
    model.compile(optimizer=optimizer)
    return model
def train_model(model: keras.models.Model, train_dataset: tf.data.Dataset, valid_dataset: tf.data.Dataset,
                num_epochs: int = CONFIG.DEFAULT_NUM_EPOCHS,
                early_stopping_patience: int = CONFIG.DEFAULT_EARLY_STOPPING_PATIENCE) -> keras.callbacks.History:
    """
    Fits the model on the training dataset, validating against the validation dataset after each epoch.

    Training is stopped early if the validation loss has not improved for `early_stopping_patience` epochs;
    in that case the best weights seen are restored.

    Returns:
        The `History` object returned by the model's `fit(...)` method.
    """
    callbacks = [
        keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=early_stopping_patience,
            restore_best_weights=True,
        ),
    ]
    return model.fit(
        x=train_dataset,
        validation_data=valid_dataset,
        epochs=num_epochs,
        callbacks=callbacks,
    )
def start(data_dir: PathT, save_dir: PathT, file_extensions: Iterable[str] = CONFIG.DEFAULT_DATA_FILE_EXTENSIONS,
          batch_size: int = CONFIG.DEFAULT_BATCH_SIZE, num_epochs: int = CONFIG.DEFAULT_NUM_EPOCHS,
          early_stopping_patience: int = CONFIG.DEFAULT_EARLY_STOPPING_PATIENCE) -> None:
    """
    Runs a full training session: loads the data, builds and trains a model, then saves model and vocabulary.

    Everything is saved in a new subdirectory of `save_dir` named after the current date and time
    (to the minute); that subdirectory is created before anything else happens.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M')
    session_dir = Path(save_dir) / timestamp
    session_dir.mkdir(parents=True)

    print("\nConstructing datasets\n")
    data_interface = DatasetsInterface(batch_size=int(batch_size), data_dir=data_dir, extensions=file_extensions)
    data_interface.split_and_make_datasets()

    print("\nBuilding model\n")
    alphabet_size = len(data_interface.characters)
    model = build_model(alphabet_size)

    print("\nBeginning training\n")
    train_model(
        model,
        data_interface.training,
        data_interface.validation,
        num_epochs=num_epochs,
        early_stopping_patience=early_stopping_patience,
    )

    print("\nSaving model\n")
    model.save(session_dir)

    print("\nSaving vocabulary\n")
    # The vocabulary is stored as one string of characters alongside the model.
    characters = data_interface.forward_lookup_table.get_vocabulary()
    (session_dir / CONFIG.VOCABULARY_FILE_NAME).write_text(''.join(characters))
    print("\nAll saved!\n")

330
src/ccaptchas/preproc.py Normal file
View File

@ -0,0 +1,330 @@
import os
import shutil
from pathlib import Path
from typing import Union, Mapping, Sequence, Iterable, Callable
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from .config import CONFIG
from .types import PathT
def label_files(src_dir: PathT, dest_dir: PathT, labels: Union[PathT, Sequence[str]],
                reverse: bool = False, extensions: Iterable[str] = None) -> None:
    """
    Copies files giving them new names by using specified labels.

    All matching files are sorted by their file name before applying the sequence of labels to them.
    The first file is named with the first label, the second is named with the second label, and so on.
    If a label duplicate is encountered, a dot followed by a counter is appended to the file name
    *preceding* the file extension, e.g. if 'some_label.jpg' exists, 'some_label.1.jpg' may be created.
    The number of matching files must be greater than or equal to the number of labels.
    Exactly one file is copied for every label; thus, after every label has been used, the operation ends.

    Args:
        src_dir:
            Path to directory containing the files to be copied/renamed
        dest_dir:
            Path to destination directory
        labels:
            Either a sequence of labels (strings) or a path to a file containing the labels (newline separated)
        reverse (optional):
            Defines which file receives which label;
            if False (default), the files in `src_dir` are sorted ascending by their file name,
            if True, the files are sorted descending by name.
        extensions (optional):
            Iterable of file extensions; only files with these extensions will be considered.

    Raises:
        NotADirectoryError: if `dest_dir` is not an existing directory.
        IndexError: if there are more labels than matching files.
    """
    # An empty string suffix matches every file name.
    extensions = '' if extensions is None else tuple(extensions)
    file_names = sorted(
        (name for name in os.listdir(src_dir) if name.endswith(extensions)),
        reverse=reverse,
    )
    try:
        with open(labels, 'r') as f:
            content = f.read().strip()
    except TypeError:
        pass  # Assume, labels is already a sequence of strings
    else:
        # Splitting an empty string would yield one empty label instead of none.
        labels = content.split('\n') if content else []
    if not os.path.isdir(dest_dir):
        raise NotADirectoryError(f"'{dest_dir}' is not a directory.")
    if len(labels) > len(file_names):
        raise IndexError(f"There are more labels ({len(labels)}) than files "
                         f"in the source directory ({len(file_names)} matching).")
    for label, file_name in zip(labels, file_names):
        _, ext = os.path.splitext(file_name)
        source_path = os.path.join(src_dir, file_name)
        while True:
            new_path = os.path.join(dest_dir, label + ext)
            if not os.path.exists(new_path):
                shutil.copyfile(source_path, new_path)
                break
            # Name is taken; either start a counter suffix or increment the existing one.
            pre_label, counter = os.path.splitext(label)
            try:
                n = int(counter[1:])
            except ValueError:
                label = label + '.1'
            else:
                label = pre_label + '.' + str(n + 1)
def load_images_data(data_dir: PathT, extensions: Iterable[str] = ('.jpg', '.png'), verbose: bool = True
                     ) -> tuple[dict[str, str], str]:
    """
    Creates a dictionary mapping file paths (of images) to their labels.

    Everything up to the first dot in the filename is taken to be the label;
    this naturally excludes file extensions, but also a filename like `ABC.1.jpg` results in the label `ABC`.
    Also creates a vocabulary of characters encountered in the file names.

    Args:
        data_dir:
            Path-like object or string to a directory containing the desired image files
        extensions (optional):
            Iterable of extensions that the files considered for the resulting data should be restricted to;
            defaults to restricting finds to JPEG and PNG files.
        verbose (optional):
            If True, the function will print out a summary of the findings before returning.

    Returns:
        2-tuple with the first element being a dictionary where the keys are the file paths and the values are the
        file names (i.e. image labels) and the second element being a string of all characters present in the labels
        (sorted, so that the result is the same for every run over the same files).
    """
    data_dir = Path(data_dir)
    if not isinstance(extensions, str):
        # Materialize once; a one-shot iterator would be exhausted after checking the first file.
        extensions = tuple(extensions)
    file_paths_and_labels, characters = {}, set()
    for file_path in data_dir.iterdir():
        if file_path.suffix not in extensions:
            continue
        label = file_path.name.split('.')[0]
        characters.update(label)
        file_paths_and_labels[str(file_path)] = label
    if verbose:
        print("Number of images/labels found: ", len(file_paths_and_labels))
        print("Number of unique characters: ", len(characters))
        print("Characters present: ", characters)
    # The iteration order of a set of strings may differ between interpreter runs; sorting makes it deterministic.
    return file_paths_and_labels, ''.join(sorted(characters))
def get_vocab_maps(characters: Iterable[str], num_oov_indices: int = 0, mask_token: str = None
                   ) -> tuple[layers.StringLookup, layers.StringLookup]:
    """
    Constructs two table-based lookup objects that map characters to integers and back.

    Details about the `StringLookup` class in the documentation:
    https://tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/StringLookup

    Args:
        characters:
            An iterable of strings representing the vocabulary to be mapped
        num_oov_indices (optional):
            Passed to both `StringLookup` constructors;
            defines the number of out-of-vocabulary (OOV) tokens to create;
            assuming that no OOV characters will be encountered, the default is 0.
        mask_token (optional):
            Passed to both `StringLookup` constructors;
            the token representing missing values;
            assuming that there will never be a value missing, the default is None.

    Returns:
        2-tuple of `StringLookup` objects, the first one mapping characters to integers, the second doing the reverse.
        By default, no OOV or missing values are assumed to be encountered,
        and thus each index (uniquely) represents a character from the vocabulary.
    """
    char_to_int = layers.StringLookup(
        vocabulary=list(characters),
        num_oov_indices=num_oov_indices,
        mask_token=mask_token,
    )
    # The inverse table must reserve the same number of OOV indices as the forward table;
    # otherwise its own default applies and the indices of both tables are offset against each other.
    int_to_char = layers.StringLookup(
        vocabulary=char_to_int.get_vocabulary(),
        num_oov_indices=num_oov_indices,
        mask_token=mask_token,
        invert=True,
    )
    return char_to_int, int_to_char
def encode_image(img):
    """
    Creates a `Tensor` object from an image file and transposes it.

    The image is decoded as grayscale, converted to float32, resized to the configured
    height and width, and finally transposed such that the width becomes the first dimension.

    Args:
        img:
            Path to the image file; may be a path-like object, a string, or a string tensor
            (the latter being what the function receives when used inside `Dataset.map(...)`).

    Returns:
        Tensor with dimensions (configured image width, configured image height, 1)
    """
    # Only path-like objects are converted to strings. Calling `str()` on a tensor
    # would yield its representation rather than the file name it holds.
    if isinstance(img, os.PathLike):
        img = os.fspath(img)
    try:
        # 0. Read image
        img = tf.io.read_file(img)
    except ValueError:
        # NOTE(review): kept from the original; presumably meant to let already loaded image data pass -- confirm
        pass
    # 1. Decode and convert to grayscale
    # NOTE(review): `decode_png` is applied regardless of the file extension -- confirm it handles JPEG input
    img = tf.io.decode_png(img, channels=1)
    # 2. Convert to float32 in [0, 1] range
    img = tf.image.convert_image_dtype(img, tf.float32)
    # 3. Resize to the desired size
    img = tf.image.resize(img, [CONFIG.IMG_HEIGHT, CONFIG.IMG_WIDTH])
    # 4. Transpose the image because we want the time
    # dimension to correspond to the width of the image.
    return tf.transpose(img, perm=[1, 0, 2])
def encode_label(label: str, forward_lookup_table: layers.StringLookup):
    """
    Creates a `Tensor` object from a label string by passing its characters through a `StringLookup` instance.
    """
    characters = tf.strings.unicode_split(label, input_encoding='UTF-8')
    return forward_lookup_table(characters)
def decode_label(tensor, backward_lookup_table: layers.StringLookup) -> str:
    """
    Creates a label string from a `Tensor` object by passing it through a `StringLookup` instance
    and joining the resulting characters.
    """
    characters = backward_lookup_table(tensor)
    joined = tf.strings.reduce_join(characters)
    return joined.numpy().decode('utf-8')
def get_sample_encoder(forward_lookup_table: layers.StringLookup) -> Callable[[str, str], dict]:
    """
    Returns a function for usage in the `map(...)` method of a `Dataset` instance.

    The returned function takes an image path and a label and produces a dictionary
    with one tensor for the encoded image and one for the encoded label;
    the keys are the configured names of the model's input layers.

    Args:
        forward_lookup_table:
            Passed to the `encode_label` function; required for mapping individual characters to indices.

    Returns:
        Callable taking two strings as arguments and returning a dictionary with string keys and Tensor values.
    """
    def func(img_path: PathT, label: str) -> dict:
        encoded_image = encode_image(img_path)
        encoded_label = encode_label(label, forward_lookup_table)
        return {
            CONFIG.INPUT_LAYER_NAME_IMAGE: encoded_image,
            CONFIG.INPUT_LAYER_NAME_LABEL: encoded_label,
        }

    return func
def make_dataset(file_paths: np.ndarray, labels: np.ndarray,
                 sample_encode_func: Callable[[str, str], dict], batch_size: int) -> tf.data.Dataset:
    """
    Generates a `Dataset` instance from an array of image file paths and an array of corresponding labels.

    Args:
        file_paths:
            Array of strings, each being the path to an image file;
            every path is handed to `sample_encode_func` as its first argument.
        labels:
            Array of strings, each being the label of the image at the same index in `file_paths`;
            every label is handed to `sample_encode_func` as its second argument.
        sample_encode_func:
            Used as the `map_func` argument of the `map(...)` method of the new `Dataset`;
            should take two strings (image path and label) and
            return a dictionary of Tensors representing the image and label.
        batch_size:
            Used as the `batch_size` argument of the `batch(...)` method of the new `Dataset`.

    Returns:
        A `Dataset` ready to be fed into a model's `fit(...)` method, provided that model has two input layers
        named in accordance with the keys of the dictionary returned by the `sample_encode_func` function.

    Raises:
        ValueError: if the two arrays differ in size.
    """
    if file_paths.size != labels.size:
        raise ValueError("Number of file paths must be equal to number of labels")
    autotune = tf.data.experimental.AUTOTUNE
    raw_samples = tf.data.Dataset.from_tensor_slices((file_paths, labels))
    encoded_samples = raw_samples.map(map_func=sample_encode_func, num_parallel_calls=autotune)
    batches = encoded_samples.batch(batch_size=batch_size)
    return batches.prefetch(buffer_size=autotune)
def get_datasets(file_paths_and_labels: Mapping[str, str], sample_encode_func: Callable[[str, str], dict],
                 batch_size: int, train_data_ratio: float, shuffle: bool = CONFIG.SHUFFLE_DATA
                 ) -> tuple[tf.data.Dataset, tf.data.Dataset]:
    """
    Creates a training dataset and a validation dataset from a mapping of image file paths to labels.

    Args:
        file_paths_and_labels:
            Mapping of image file paths to the labels of the corresponding images;
            this is the full dataset used for fitting the model.
        sample_encode_func:
            Handed on to `make_dataset(...)`;
            should take two strings (image path and label) and
            return a dictionary of Tensors representing the image and label.
        batch_size:
            Handed on to `make_dataset(...)`; determines how each dataset is divided into batches.
        train_data_ratio:
            Floating point value between 0 and 1; the share of the full dataset used for training,
            which leaves (1 - `train_data_ratio`) for validation.
        shuffle:
            If True, the full dataset is shuffled pseudo-randomly before being split.

    Returns:
        Two `Dataset` objects ready to be fed into a model's `fit(...)` method, provided that model has two input layers
        named in accordance with the keys of the dictionary returned by the `sample_encode_func` function.
    """
    total = len(file_paths_and_labels)
    # Shuffle positions rather than the data itself, so that paths and labels stay aligned.
    order = np.arange(total)
    if shuffle:
        np.random.shuffle(order)
    # Everything before the cutoff goes to training, the rest to validation.
    cutoff = int(total * train_data_ratio)
    train_indices = order[:cutoff]
    valid_indices = order[cutoff:]
    file_paths = np.array(list(file_paths_and_labels.keys()))
    labels = np.array(list(file_paths_and_labels.values()))
    train_dataset = make_dataset(file_paths[train_indices], labels[train_indices], sample_encode_func, batch_size)
    valid_dataset = make_dataset(file_paths[valid_indices], labels[valid_indices], sample_encode_func, batch_size)
    return train_dataset, valid_dataset
def plot_images(images: Sequence[np.ndarray], labels: Sequence[str] = None, num_columns: int = 4,
                transpose: bool = True) -> None:
    """
    Displays grayscale images in a grid, optionally with a title above each.

    Args:
        images:
            Sequence of equally sized image arrays with three dimensions each, the last being the channel;
            only the first channel is displayed, its values being scaled by 255 and cast to `uint8`.
        labels (optional):
            Sequence of strings; if provided, the label at index i is used as the title of the image at index i.
        num_columns (optional):
            Number of images per row of the grid
        transpose (optional):
            If True (default), the first two dimensions of each image are swapped before plotting.
    """
    # Convert first, so that indexing below also works when `transpose` is False and a list was passed.
    images = tf.convert_to_tensor(images)
    if transpose:
        images = tf.transpose(images, perm=[0, 2, 1, 3])
    images = (images[:, :, :, 0] * 255).numpy().astype('uint8')
    # Round up, so that an incomplete last row still gets its own row in the grid.
    num_rows = max(1, -(-len(images) // num_columns))
    # Without squeezing, the axes always form a 2D array, regardless of the number of rows/columns.
    _, axs = plt.subplots(num_rows, num_columns, figsize=(10, 5), squeeze=False)
    for ax in axs.flat:
        ax.axis('off')  # also hides the unused cells of the grid
    for idx, image in enumerate(images):
        ax = axs[idx // num_columns, idx % num_columns]
        ax.imshow(image, cmap='gray')
        if labels is not None:
            ax.set_title(labels[idx])
    plt.show()
class DatasetsInterface:
    """
    Convenience class for loading and pre-processing the training and validation data for usage with a model.

    Upon initialization the image files are discovered and the lookup tables are created;
    the datasets themselves are available only after calling `split_and_make_datasets`.
    """

    def __init__(self, batch_size: int, data_dir: PathT,
                 extensions: Iterable[str] = CONFIG.DEFAULT_DATA_FILE_EXTENSIONS) -> None:
        images_data = load_images_data(data_dir=data_dir, extensions=extensions)
        # Mapping of file paths to labels and string of all characters occurring in the labels
        self.file_paths_and_labels, self.characters = images_data
        lookup_tables = get_vocab_maps(characters=self.characters)
        # Character-to-index table and index-to-character table
        self.forward_lookup_table, self.backward_lookup_table = lookup_tables
        self.sample_encode_func = get_sample_encoder(forward_lookup_table=self.forward_lookup_table)
        self.batch_size = batch_size
        # Both remain None until `split_and_make_datasets` is called.
        self.training = None
        self.validation = None

    def split_and_make_datasets(self, train_data_ratio: float = (1 - CONFIG.VALIDATION_DATA_RATIO),
                                shuffle: bool = CONFIG.SHUFFLE_DATA) -> None:
        """
        Splits the loaded data and stores the resulting datasets in the `training` and `validation` attributes.
        """
        datasets = get_datasets(
            file_paths_and_labels=self.file_paths_and_labels,
            sample_encode_func=self.sample_encode_func,
            batch_size=self.batch_size,
            train_data_ratio=train_data_ratio,
            shuffle=shuffle,
        )
        self.training, self.validation = datasets

5
src/ccaptchas/types.py Normal file
View File

@ -0,0 +1,5 @@
from pathlib import Path
from typing import Union
# Type alias for parameters that accept a file system path either as a `Path` object or as a string.
PathT = Union[Path, str]