generated from daniil-berg/boilerplate-py
initial
This commit is contained in:
parent
112a6c57ff
commit
be3aebaf07
@ -0,0 +1,3 @@
|
|||||||
|
tensorflow
|
||||||
|
numpy
|
||||||
|
matplotlib
|
@ -1,8 +1,8 @@
|
|||||||
[metadata]
|
[metadata]
|
||||||
name = ccaptchas
|
name = ccaptchas
|
||||||
version = 0.0.1
|
version = 0.0.1
|
||||||
author = Daniil
|
author = Daniil Fajnberg
|
||||||
author_email = mail@placeholder123.to
|
author_email = mail@daniil.fajnberg.de
|
||||||
description = Character CAPTCHA Solver
|
description = Character CAPTCHA Solver
|
||||||
long_description = file: README.md
|
long_description = file: README.md
|
||||||
long_description_content_type = text/markdown
|
long_description_content_type = text/markdown
|
||||||
@ -19,7 +19,8 @@ package_dir =
|
|||||||
packages = find:
|
packages = find:
|
||||||
python_requires = >=3
|
python_requires = >=3
|
||||||
install_requires =
|
install_requires =
|
||||||
...
|
numpy
|
||||||
|
matplotlib
|
||||||
|
|
||||||
[options.extras_require]
|
[options.extras_require]
|
||||||
dev =
|
dev =
|
||||||
|
110
src/ccaptchas/__main__.py
Normal file
110
src/ccaptchas/__main__.py
Normal file
@ -0,0 +1,110 @@
|
|||||||
|
from argparse import ArgumentParser
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Sequence
|
||||||
|
|
||||||
|
from .config import CONFIG
|
||||||
|
|
||||||
|
|
||||||
|
# Names under which parsed command line values are stored in the dictionary returned by `parse_cli`.
# `CMD` is the key holding the chosen sub-command; `TRAIN` is one of its possible values.
# The remaining names up to `EARLY_STOPPING_PATIENCE` are argument destinations used by the `train` parser
# (`DATA_DIR` is also used by the `infer` parser; `FILE_EXTENSIONS` belongs to the top-level parser).
CMD = 'command'
|
||||||
|
TRAIN = 'train'
|
||||||
|
DATA_DIR = 'data_dir'
|
||||||
|
SAVE_DIR = 'save_dir'
|
||||||
|
FILE_EXTENSIONS = 'file_extensions'
|
||||||
|
BATCH_SIZE = 'batch_size'
|
||||||
|
NUM_EPOCHS = 'num_epochs'
|
||||||
|
EARLY_STOPPING_PATIENCE = 'early_stopping_patience'
|
||||||
|
|
||||||
|
# `INFER` is the other sub-command; `MODEL_DIR` is the destination of its first positional argument.
INFER = 'infer'
|
||||||
|
MODEL_DIR = 'model_dir'
|
||||||
|
|
||||||
|
|
||||||
|
def ext_list(string: str) -> list[str]:
    """
    Parses a comma-separated string of file extensions into a list.

    Whitespace around each item is stripped before it is checked.

    Args:
        string:
            Comma-separated file extensions, e.g. `'.jpg, .png'`

    Returns:
        List of the stripped extensions in the order they were given.

    Raises:
        ValueError:
            If any item does not start with a dot (this includes empty items).
    """
    extensions = [part.strip() for part in string.split(',')]
    if not all(extension.startswith('.') for extension in extensions):
        raise ValueError("Extensions must start with a dot")
    return extensions
|
||||||
|
|
||||||
|
|
||||||
|
def parse_cli(args: Sequence[str] = None) -> dict[str, Any]:
    """
    Builds the command line interface and parses arguments with it.

    The top-level parser has one option (the data file extensions) and two sub-commands, `train` and `infer`,
    each with its own positional arguments and options. Because the extensions option belongs to the
    top-level parser, it has to be given before the sub-command on the command line.

    Args:
        args (optional):
            Argument strings to parse; if None (default), `argparse` falls back to `sys.argv[1:]`.

    Returns:
        Dictionary of the parsed arguments keyed by destination name. The chosen sub-command is stored under
        the `CMD` key; it is None when no sub-command was given, since the sub-parsers are not marked required.
    """
    def flag(dest: str) -> str:
        # Long option name for a destination, e.g. 'batch_size' -> '--batch-size';
        # `argparse` derives the destination back from it by replacing dashes with underscores.
        return f'--{dest.replace("_", "-")}'

    parser = ArgumentParser(
        prog=CONFIG.PROGRAM_NAME,
        description="Character CAPTCHA Solver",
    )
    parser.add_argument(
        '-E', flag(FILE_EXTENSIONS),
        default=CONFIG.DEFAULT_DATA_FILE_EXTENSIONS,
        type=ext_list,
        # Each fragment ends with a space so that the concatenated sentences do not run into each other.
        help=f"When used in `{TRAIN}` mode, extensions of the image files to be used for training/testing the model. "
             f"When used in `{INFER}` mode, extensions of the image files to use the model on. "
             f"Defaults to {CONFIG.DEFAULT_DATA_FILE_EXTENSIONS}."
    )
    subparsers = parser.add_subparsers(dest=CMD)

    # Sub-command for training a new model
    parser_train = subparsers.add_parser(TRAIN, help="trains a new model")
    parser_train.add_argument(
        DATA_DIR,
        type=Path,
        help="Directory containing the image files to be used for training/testing the model."
    )
    parser_train.add_argument(
        '-s', flag(SAVE_DIR),
        default=CONFIG.DEFAULT_SAVE_DIR,
        type=Path,
        help=f"Directory in which to save trained models. A subdirectory for each training session named with the "
             f"current date and time will be created there and the model will be saved in that subdirectory. "
             f"Defaults to '{CONFIG.DEFAULT_SAVE_DIR}'."
    )
    parser_train.add_argument(
        '-b', flag(BATCH_SIZE),
        default=CONFIG.DEFAULT_BATCH_SIZE,
        type=int,
        help=f"The dataset will be divided into batches; this determines the number of images in each batch. "
             f"Defaults to {CONFIG.DEFAULT_BATCH_SIZE}."
    )
    parser_train.add_argument(
        '-n', flag(NUM_EPOCHS),
        default=CONFIG.DEFAULT_NUM_EPOCHS,
        type=int,
        help=f"The number of training epochs. Defaults to {CONFIG.DEFAULT_NUM_EPOCHS}."
    )
    parser_train.add_argument(
        '-p', flag(EARLY_STOPPING_PATIENCE),
        default=CONFIG.DEFAULT_EARLY_STOPPING_PATIENCE,
        type=int,
        help=f"The number of training epochs with no improvement over a previously achieved optimum to allow before "
             f"stopping training early (i.e. without completing all epochs). "
             f"Defaults to {CONFIG.DEFAULT_EARLY_STOPPING_PATIENCE}."
    )

    # Sub-command for using an existing model
    parser_infer = subparsers.add_parser(INFER, help="uses an existing model to make inferences")
    parser_infer.add_argument(
        MODEL_DIR,
        type=Path,
        help="Directory containing the model to use for inference."
    )
    parser_infer.add_argument(
        DATA_DIR,
        type=Path,
        help="Directory containing the image files to use the model on."
    )
    return vars(parser.parse_args(args))
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """
    Entry point of the program: parses the command line and dispatches to the chosen sub-command.

    The module implementing a sub-command is imported only once that sub-command has been selected.
    All parsed arguments other than the sub-command itself are passed on as keyword arguments.

    Raises:
        NotImplementedError:
            If no sub-command (or one that is not handled here) was given.
    """
    kwargs = parse_cli()
    cmd = kwargs.pop(CMD)
    if cmd == TRAIN:
        from .model import start
    elif cmd == INFER:
        from .infer import start
    else:
        # `NotImplemented` is a comparison sentinel, not an exception; raising it would produce a `TypeError`.
        raise NotImplementedError(f"Unknown or missing command: {cmd!r}")
    start(**kwargs)


if __name__ == '__main__':
    main()
|
20
src/ccaptchas/config.py
Normal file
20
src/ccaptchas/config.py
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
class CONFIG:
    """
    Namespace holding the package-wide constants as class attributes.
    """
    # Empty slots: instances (should one ever be created) cannot carry attributes of their own.
    __slots__ = ()

    # Program name shown by the command line interface
    PROGRAM_NAME = 'ccaptchas'

    # Default values of the command line options
    DEFAULT_SAVE_DIR = Path('.', 'saved_models')
    DEFAULT_DATA_FILE_EXTENSIONS = ('.jpg', '.png')
    DEFAULT_BATCH_SIZE = 10
    DEFAULT_NUM_EPOCHS = 100
    DEFAULT_EARLY_STOPPING_PATIENCE = 10

    # Dataset splitting
    VALIDATION_DATA_RATIO = 1 / 8
    SHUFFLE_DATA = True

    # Names of the model layers that other modules look up
    INPUT_LAYER_NAME_IMAGE = 'image'
    INPUT_LAYER_NAME_LABEL = 'label'
    OUTPUT_LAYER_NAME = 'encoded_output'

    # Maximum number of characters in any captcha image in the dataset
    MAX_STRING_LENGTH = 6

    # Desired image dimensions
    IMG_WIDTH = 250
    IMG_HEIGHT = 50

    # Name of the file (inside a saved model's directory) that stores the vocabulary
    VOCABULARY_FILE_NAME = '.vocabulary'
|
63
src/ccaptchas/infer.py
Normal file
63
src/ccaptchas/infer.py
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import tensorflow as tf
|
||||||
|
from keras.api._v2.keras.models import Model, load_model
|
||||||
|
from keras.api._v2.keras.layers.experimental.preprocessing import StringLookup
|
||||||
|
from keras.api._v2.keras.backend import ctc_decode
|
||||||
|
|
||||||
|
from .config import CONFIG
|
||||||
|
from .preproc import encode_image, decode_label, plot_images
|
||||||
|
from .types import PathT
|
||||||
|
|
||||||
|
|
||||||
|
def images_to_input(*images) -> tf.data.Dataset:
    """
    Encodes the given images and wraps them in a `Dataset` with one element per image.

    Args:
        images:
            Any number of images in a form accepted by `encode_image`

    Returns:
        `Dataset` sliced from the array of encoded images; it is not batched.
    """
    encoded_images = []
    for image in images:
        encoded_images.append(encode_image(image))
    return tf.data.Dataset.from_tensor_slices(np.array(encoded_images))
|
||||||
|
|
||||||
|
|
||||||
|
def decode_batch_outputs(predictions, backward_lookup_table: StringLookup) -> list[str]:
    """
    Turns a batch of model outputs into the predicted strings.

    Args:
        predictions:
            Output of the inference model for one batch; its first two dimensions are read as
            the number of samples and the number of time steps per sample.
        backward_lookup_table:
            Passed to `decode_label` for mapping the decoded indices back to characters

    Returns:
        List with one decoded string per sample in the batch.
    """
    num_samples, num_time_steps = predictions.shape[0], predictions.shape[1]
    # Every sample uses the full sequence length
    sequence_lengths = np.ones(num_samples) * num_time_steps
    # Greedy search is used here; beam search is the alternative for more complex tasks
    decoded, _ = ctc_decode(predictions, input_length=sequence_lengths, greedy=True)
    # Only the first decoded candidate is used, cut off at the maximum label length
    truncated = decoded[0][:, :CONFIG.MAX_STRING_LENGTH]
    texts = []
    for sequence in truncated:
        texts.append(decode_label(sequence, backward_lookup_table))
    return texts
|
||||||
|
|
||||||
|
|
||||||
|
def load_inference_model(path: PathT) -> Model:
    """
    Loads a saved model together with its vocabulary and prepares it for inference.

    The returned model goes from the saved model's image input layer to its encoded output layer.
    Two extra attributes are attached to it for convenience:
    `backward_lookup_table` (maps indices to characters) and
    `infer_and_decode` (function taking a dataset and returning the predicted strings).

    Args:
        path:
            Directory containing the saved model and the vocabulary file

    Returns:
        The inference model with the two attributes described above.
    """
    with open(Path(path, CONFIG.VOCABULARY_FILE_NAME), 'r') as vocab_file:
        # The file holds the vocabulary as one string of single characters
        vocabulary = list(vocab_file.read())
    # The stored vocabulary has no out-of-vocabulary entry; the first character must therefore map to index 0.
    # With the layer's default of one OOV index, index 0 would be the OOV token
    # and every decoded character would be shifted by one.
    backward_lookup_table = StringLookup(vocabulary=vocabulary, num_oov_indices=0, mask_token=None, invert=True)
    saved_model = load_model(path)
    inference_model = Model(
        saved_model.get_layer(name=CONFIG.INPUT_LAYER_NAME_IMAGE).input,
        saved_model.get_layer(name=CONFIG.OUTPUT_LAYER_NAME).output
    )

    def infer_and_decode(x: tf.data.Dataset) -> list[str]:
        return decode_batch_outputs(predictions=inference_model.predict(x), backward_lookup_table=backward_lookup_table)

    inference_model.infer_and_decode = infer_and_decode
    inference_model.backward_lookup_table = backward_lookup_table
    return inference_model
|
||||||
|
|
||||||
|
|
||||||
|
def start(model_dir: PathT, data_dir: PathT,
          file_extensions: Iterable[str] = CONFIG.DEFAULT_DATA_FILE_EXTENSIONS) -> None:
    """
    Runs a saved model on the images in a directory and plots them with the predicted labels as titles.

    Args:
        model_dir:
            Directory containing the saved model to use
        data_dir:
            Directory containing the image files to use the model on
        file_extensions (optional):
            Only files in `data_dir` ending with one of these extensions are used.

    Raises:
        ValueError:
            If no matching file is found or if more than 24 matching files are found.
    """
    max_files = 24  # NOTE(review): presumably the most images the plot is meant to show at once - confirm
    data_dir = Path(data_dir)
    # A set avoids using a file twice when the same extension is listed more than once
    file_paths = sorted({path for ext in file_extensions for path in data_dir.glob(f'*{ext}')})
    count = len(file_paths)
    if count == 0:
        # Without this check the failure would only surface when batching with a batch size of zero
        raise ValueError(f"No matching files found in '{data_dir}'")
    if count > max_files:
        raise ValueError("Too many files")
    dataset = images_to_input(*file_paths)
    model = load_inference_model(model_dir)
    # All images are passed through the model as one single batch
    labels = model.infer_and_decode(dataset.batch(count))
    plot_images(list(dataset.as_numpy_iterator()), labels=labels)
|
178
src/ccaptchas/model.py
Normal file
178
src/ccaptchas/model.py
Normal file
@ -0,0 +1,178 @@
|
|||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import tensorflow as tf
|
||||||
|
from tensorflow import keras
|
||||||
|
from tensorflow.keras import layers
|
||||||
|
|
||||||
|
from .config import CONFIG
|
||||||
|
from .preproc import DatasetsInterface
|
||||||
|
from .types import PathT
|
||||||
|
|
||||||
|
|
||||||
|
THIS_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||||
|
# Build paths relative to this file's directory like this: os.path.join(THIS_DIR, ...)
|
||||||
|
|
||||||
|
|
||||||
|
class CTCLayer(layers.Layer):
    """
    Layer that registers the CTC loss for a batch and passes the predictions through unchanged.
    """

    def __init__(self, name: str = None):
        super().__init__(name=name)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true: np.ndarray = None, y_pred: np.ndarray = None) -> np.ndarray:
        """
        Computes the loss for the batch, adds it to the layer via `add_loss` and returns `y_pred` as is.

        The first dimension of `y_true` is read as the batch size; the second dimension of `y_pred` and
        of `y_true` is used as the input length and the label length respectively for every sample.
        """
        batch_size = tf.cast(tf.shape(y_true)[0], dtype='int64')

        def per_sample(length):
            # Column with one row per sample, each holding the same length
            return tf.cast(length, dtype='int64') * tf.ones(shape=(batch_size, 1), dtype='int64')

        input_lengths = per_sample(tf.shape(y_pred)[1])
        label_lengths = per_sample(tf.shape(y_true)[1])
        self.add_loss(self.loss_fn(y_true, y_pred, input_lengths, label_lengths))
        # Nothing else to do at test time; the predictions are the output
        return y_pred
|
||||||
|
|
||||||
|
|
||||||
|
# Factor by which the image is going to be downsampled
|
||||||
|
# by the convolutional blocks. Two convolution blocks
|
||||||
|
# are used and each of them has
|
||||||
|
# a pooling layer which downsamples the features by a factor of 2.
|
||||||
|
# Hence the total downsampling factor is 4.
# NOTE(review): nothing in the visible part of this module reads this name - confirm it is still needed.
|
||||||
|
downsample_factor = 4
|
||||||
|
|
||||||
|
|
||||||
|
def build_model(alphabet_size: int,
                img_width: int = CONFIG.IMG_WIDTH,
                img_height: int = CONFIG.IMG_HEIGHT,
                optimizer: keras.optimizers.Optimizer = None) -> keras.models.Model:
    """
    Constructs and compiles the model: two convolution blocks, two bidirectional LSTM layers and a CTC loss layer.

    Args:
        alphabet_size:
            Number of distinct characters; the output layer gets one more unit than that.
        img_width (optional):
            Width of the input images
        img_height (optional):
            Height of the input images
        optimizer (optional):
            Optimizer to compile the model with; if None (default), a new `Adam` instance is created.

    Returns:
        The compiled model; it takes an image input and a label input named as configured in `CONFIG`.
    """
    if optimizer is None:
        # Created per call; an instance in the signature would be built once and shared by every model.
        optimizer = keras.optimizers.Adam()
    # Inputs to the model
    input_img = layers.Input(
        shape=(img_width, img_height, 1),
        dtype='float32',
        name=CONFIG.INPUT_LAYER_NAME_IMAGE
    )
    labels = layers.Input(
        shape=(None, ),
        dtype='float32',
        name=CONFIG.INPUT_LAYER_NAME_LABEL,
    )
    # First conv block
    x = layers.Conv2D(
        filters=32,
        kernel_size=(3, 3),
        activation='relu',
        kernel_initializer='he_normal',
        padding='same',
        name='conv1',
    )(input_img)
    x = layers.MaxPooling2D(
        pool_size=(2, 2),
        name='pool1'
    )(x)
    # Second conv block
    x = layers.Conv2D(
        filters=64,
        kernel_size=(3, 3),
        activation='relu',
        kernel_initializer='he_normal',
        padding='same',
        name='conv2',
    )(x)
    x = layers.MaxPooling2D(
        pool_size=(2, 2),
        name='pool2'
    )(x)
    # Two max. pooling layers with pool size 2 have been applied, so the feature maps are 4x smaller.
    # The number of filters in the last conv layer is 64.
    # Reshape accordingly before passing the output to the RNN part of the model.
    new_shape = (
        (img_width // 4),
        (img_height // 4) * 64
    )
    x = layers.Reshape(
        target_shape=new_shape,
        name='reshape'
    )(x)
    x = layers.Dense(
        units=64,
        activation='relu',
        name='dense1'
    )(x)
    x = layers.Dropout(rate=0.2)(x)
    # RNNs
    x = layers.Bidirectional(
        layers.LSTM(
            units=128,
            return_sequences=True,
            dropout=0.25,
        )
    )(x)
    x = layers.Bidirectional(
        layers.LSTM(
            units=64,
            return_sequences=True,
            dropout=0.25,
        )
    )(x)
    # Output layer
    x = layers.Dense(
        units=alphabet_size + 1,
        activation='softmax',
        name=CONFIG.OUTPUT_LAYER_NAME,
    )(x)
    # Add CTC layer for calculating CTC loss at each step
    output = CTCLayer(name='ctc_loss')(labels, x)
    # Define the model
    model = keras.models.Model(
        inputs=[input_img, labels],
        outputs=output,
        name='ocr_model_v1'
    )
    # The loss is added by the CTC layer itself, so only the optimizer is passed here
    model.compile(optimizer=optimizer)
    return model
|
||||||
|
|
||||||
|
|
||||||
|
def train_model(model: keras.models.Model, train_dataset: tf.data.Dataset, valid_dataset: tf.data.Dataset,
                num_epochs: int = CONFIG.DEFAULT_NUM_EPOCHS,
                early_stopping_patience: int = CONFIG.DEFAULT_EARLY_STOPPING_PATIENCE) -> keras.callbacks.History:
    """
    Fits the model on the training dataset with early stopping based on the validation loss.

    Args:
        model:
            The compiled model to train
        train_dataset:
            Dataset to fit the model on
        valid_dataset:
            Dataset used for validation
        num_epochs (optional):
            Number of epochs passed to the model's `fit(...)` method
        early_stopping_patience (optional):
            Patience passed to the early stopping callback

    Returns:
        The `History` object returned by the model's `fit(...)` method.
    """
    # The weights of the best epoch are restored when training stops
    callbacks = [
        keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=early_stopping_patience,
            restore_best_weights=True,
        ),
    ]
    return model.fit(
        x=train_dataset,
        validation_data=valid_dataset,
        epochs=num_epochs,
        callbacks=callbacks,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def start(data_dir: PathT, save_dir: PathT, file_extensions: Iterable[str] = CONFIG.DEFAULT_DATA_FILE_EXTENSIONS,
          batch_size: int = CONFIG.DEFAULT_BATCH_SIZE, num_epochs: int = CONFIG.DEFAULT_NUM_EPOCHS,
          early_stopping_patience: int = CONFIG.DEFAULT_EARLY_STOPPING_PATIENCE) -> None:
    """
    Runs a full training session: loads the data, builds and trains a model, then saves model and vocabulary.

    Everything is saved in a new subdirectory of `save_dir` named with the current date and time
    (to the minute); creating it fails if that subdirectory already exists.

    Args:
        data_dir:
            Directory containing the image files to be used for training/validation
        save_dir:
            Directory in which the subdirectory for this session is created
        file_extensions (optional):
            Extensions of the image files to be used
        batch_size (optional):
            Number of images per batch
        num_epochs (optional):
            Number of training epochs
        early_stopping_patience (optional):
            Number of epochs without improvement to allow before stopping early
    """
    session_name = datetime.now().strftime('%Y-%m-%d_%H-%M')
    save_dir = Path(save_dir, session_name)
    save_dir.mkdir(parents=True)

    print("\nConstructing datasets\n")
    data_interface = DatasetsInterface(batch_size=int(batch_size), data_dir=data_dir, extensions=file_extensions)
    data_interface.split_and_make_datasets()

    print("\nBuilding model\n")
    alphabet_size = len(data_interface.characters)
    model = build_model(alphabet_size)

    print("\nBeginning training\n")
    train_model(
        model,
        data_interface.training,
        data_interface.validation,
        num_epochs=num_epochs,
        early_stopping_patience=early_stopping_patience,
    )

    print("\nSaving model\n")
    model.save(save_dir)

    print("\nSaving vocabulary\n")
    # The vocabulary is stored as one string next to the model
    vocabulary = ''.join(data_interface.forward_lookup_table.get_vocabulary())
    with open(save_dir / CONFIG.VOCABULARY_FILE_NAME, 'w') as vocab_file:
        vocab_file.write(vocabulary)
    print("\nAll saved!\n")
|
330
src/ccaptchas/preproc.py
Normal file
330
src/ccaptchas/preproc.py
Normal file
@ -0,0 +1,330 @@
|
|||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Union, Mapping, Sequence, Iterable, Callable
|
||||||
|
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import numpy as np
|
||||||
|
import tensorflow as tf
|
||||||
|
from tensorflow.keras import layers
|
||||||
|
|
||||||
|
from .config import CONFIG
|
||||||
|
from .types import PathT
|
||||||
|
|
||||||
|
|
||||||
|
def _next_label(label: str) -> str:
    """Returns the label to try next after `label` turned out to be taken: 'x' -> 'x.1', 'x.1' -> 'x.2'."""
    stem, suffix = os.path.splitext(label)
    try:
        counter = int(suffix[1:])
    except ValueError:
        # No numeric counter at the end yet
        return label + '.1'
    return stem + '.' + str(counter + 1)


def label_files(src_dir: PathT, dest_dir: PathT, labels: Union[PathT, Sequence[str]],
                reverse: bool = False, extensions: Iterable[str] = None) -> None:
    """
    Copies files giving them new names by using specified labels.

    All matching files are sorted by their file name before applying the sequence of labels to them.
    The first file is named with the first label, the second is named with the second label, and so on.
    If a label duplicate is encountered, a dot followed by a counter is appended to the file name
    *preceding* the file extension, e.g. if 'some_label.jpg' exists, 'some_label.1.jpg' may be created.

    The number of matching files must be greater than or equal to the number of labels.
    Exactly one file is copied for every label; thus, after every label has been used, the operation ends.

    Args:
        src_dir:
            Path to directory containing the files to be copied/renamed;
            only regular files are considered, sub-directories are ignored.
        dest_dir:
            Path to destination directory; it must already exist.
        labels:
            Either a sequence of labels (strings) or a path to a file containing the labels (newline separated)
        reverse (optional):
            Defines which file receives which label;
            if False (default), the files in `src_dir` are sorted ascending by their file name,
            if True, the files are sorted descending by name.
        extensions (optional):
            Iterable of file extensions; only files with these extensions will be considered.
            If None (default), all files are considered.

    Raises:
        NotADirectoryError:
            If `dest_dir` is not a directory.
        IndexError:
            If there are more labels than matching files.
    """
    # An empty string makes `str.endswith` match every name
    extensions = '' if extensions is None else tuple(extensions)
    file_names = sorted(
        (
            name for name in os.listdir(src_dir)
            # Directories cannot be copied with `copyfile`, so they are left out
            if name.endswith(extensions) and os.path.isfile(os.path.join(src_dir, name))
        ),
        reverse=reverse,
    )
    try:
        with open(labels, 'r') as f:
            labels = f.read().strip().split('\n')
    except TypeError:
        pass  # Assume, labels is already a sequence of strings
    if not os.path.isdir(dest_dir):
        raise NotADirectoryError(f"'{dest_dir}' is not a directory.")
    if len(labels) > len(file_names):
        raise IndexError(f"There are more labels ({len(labels)}) than files "
                         f"in the source directory ({len(file_names)} matching).")
    for file_name, label in zip(file_names, labels):
        _, ext = os.path.splitext(file_name)
        new_path = os.path.join(dest_dir, label + ext)
        # Keep counting up until a name is found that is not taken yet
        while os.path.exists(new_path):
            label = _next_label(label)
            new_path = os.path.join(dest_dir, label + ext)
        shutil.copyfile(os.path.join(src_dir, file_name), new_path)
|
||||||
|
|
||||||
|
|
||||||
|
def load_images_data(data_dir: PathT, extensions: Iterable[str] = ('.jpg', '.png'), verbose: bool = True
                     ) -> tuple[dict[str, str], str]:
    """
    Creates a dictionary mapping file paths (of images) to their labels.
    Everything up to the first dot in the filename is taken to be the label;
    this naturally excludes file extensions, but also a filename like `ABC.1.jpg` results in the label `ABC`.
    Also creates a vocabulary of characters encountered in the file names.

    Args:
        data_dir:
            Path-like object or string to a directory containing the desired image files
        extensions (optional):
            Iterable of extensions that the files considered for the resulting data should be restricted to;
            defaults to restricting finds to JPEG and PNG files.
            A single string is treated as one extension.
        verbose (optional):
            If True, the function will print out a summary of the findings before returning.

    Returns:
        2-tuple with the first element being a dictionary where the keys are the file paths and the values are the
        image labels and the second element being a string of all characters present in the labels.
        The characters are sorted, so the same set of labels always yields the same string.
    """
    data_dir = Path(data_dir)
    if isinstance(extensions, str):
        extensions = (extensions,)
    # Materialized once, so that a one-shot iterator is not used up by the first membership test
    extensions = frozenset(extensions)
    file_paths_and_labels, characters = {}, set()
    for file_path in data_dir.iterdir():
        if file_path.suffix not in extensions:
            continue
        label = file_path.name.split('.')[0]
        characters.update(label)
        file_paths_and_labels[str(file_path)] = label
    if verbose:
        print("Number of images/labels found: ", len(file_paths_and_labels))
        print("Number of unique characters: ", len(characters))
        print("Characters present: ", characters)
    # Set iteration order is arbitrary; sorting makes the vocabulary reproducible between runs
    return file_paths_and_labels, ''.join(sorted(characters))
|
||||||
|
|
||||||
|
|
||||||
|
def get_vocab_maps(characters: Iterable[str], num_oov_indices: int = 0, mask_token: str = None
                   ) -> tuple[layers.StringLookup, layers.StringLookup]:
    """
    Constructs two table-based lookup objects that map characters to integers and back.

    Details about the `StringLookup` class in the documentation:
    https://tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/StringLookup

    Args:
        characters:
            An iterable of strings representing the vocabulary to be mapped
        num_oov_indices (optional):
            Passed to both `StringLookup` constructors;
            defines the number of out-of-vocabulary (OOV) tokens to create;
            assuming that no OOV characters will be encountered, the default is 0.
        mask_token (optional):
            Passed to both `StringLookup` constructors;
            the token representing missing values;
            assuming that there will never be a value missing, the default is None.

    Returns:
        2-tuple of `StringLookup` objects, the first one mapping characters to integers, the second doing the reverse.
        By default, no OOV or missing values are assumed to be encountered,
        and thus each index (uniquely) represents a character from the vocabulary.
    """
    char_to_int = layers.StringLookup(
        vocabulary=list(characters),
        num_oov_indices=num_oov_indices,
        mask_token=mask_token,
    )
    int_to_char = layers.StringLookup(
        vocabulary=char_to_int.get_vocabulary(),
        # Must match the forward table; the layer's default of one OOV index would reserve index 0
        # for the OOV token and shift every character by one relative to `char_to_int`.
        num_oov_indices=num_oov_indices,
        mask_token=mask_token,
        invert=True,
    )
    return char_to_int, int_to_char
|
||||||
|
|
||||||
|
|
||||||
|
def encode_image(img):
    """
    Creates a `Tensor` object from an image file and transposes it.

    Args:
        img:
            Path to the image file, given as a string, a path-like object or a string tensor
            (the latter is what the function receives inside the `map(...)` method of a `Dataset`)

    Returns:
        The image as a float tensor with a single channel, resized to the configured dimensions,
        with the width as its first dimension.
    """
    if isinstance(img, os.PathLike):
        # Only path objects are converted; calling `str` on a tensor would yield its representation
        # (not its value) and that representation would then be used as the file name.
        img = os.fspath(img)
    try:
        # 0. Read image
        img = tf.io.read_file(img)
    except ValueError:
        # NOTE(review): presumably meant to let callers pass image content that is already loaded - confirm
        pass
    # 1. Decode and convert to grayscale
    img = tf.io.decode_png(img, channels=1)
    # 2. Convert to float32 in [0, 1] range
    img = tf.image.convert_image_dtype(img, tf.float32)
    # 3. Resize to the desired size
    img = tf.image.resize(img, [CONFIG.IMG_HEIGHT, CONFIG.IMG_WIDTH])
    # 4. Transpose the image because we want the time
    # dimension to correspond to the width of the image.
    return tf.transpose(img, perm=[1, 0, 2])
|
||||||
|
|
||||||
|
|
||||||
|
def encode_label(label: str, forward_lookup_table: layers.StringLookup):
    """
    Splits a label string into its characters and maps them through a `StringLookup` instance.

    Args:
        label:
            The label to encode
        forward_lookup_table:
            Lookup object the split characters are passed to

    Returns:
        Whatever `forward_lookup_table` returns for the characters of the label.
    """
    characters = tf.strings.unicode_split(label, input_encoding='UTF-8')
    return forward_lookup_table(characters)
|
||||||
|
|
||||||
|
|
||||||
|
def decode_label(tensor, backward_lookup_table: layers.StringLookup) -> str:
    """
    Maps a tensor through a `StringLookup` instance and joins the result into one string.

    Args:
        tensor:
            The encoded label
        backward_lookup_table:
            Lookup object the tensor is passed to

    Returns:
        The joined lookup results, decoded as UTF-8.
    """
    characters = backward_lookup_table(tensor)
    joined = tf.strings.reduce_join(characters)
    return joined.numpy().decode('utf-8')
|
||||||
|
|
||||||
|
|
||||||
|
def get_sample_encoder(forward_lookup_table: layers.StringLookup) -> Callable[[str, str], dict]:
    """
    Creates the function that encodes one data sample, for usage in the `map(...)` method of a `Dataset` instance.

    The created function takes an image path and a label and returns a dictionary with two items:
    the encoded image and the encoded label. The keys are the configured names of the model's input layers.

    Args:
        forward_lookup_table:
            Passed to the `encode_label` function whenever a label is encoded.

    Returns:
        Callable taking two arguments (image path and label) and returning the dictionary described above.
    """
    def encode_sample(img_path: PathT, label: str) -> dict:
        sample = {}
        sample[CONFIG.INPUT_LAYER_NAME_IMAGE] = encode_image(img_path)
        sample[CONFIG.INPUT_LAYER_NAME_LABEL] = encode_label(label, forward_lookup_table)
        return sample

    return encode_sample
|
||||||
|
|
||||||
|
|
||||||
|
def make_dataset(file_paths: np.ndarray, labels: np.ndarray,
                 sample_encode_func: Callable[[str, str], dict], batch_size: int) -> tf.data.Dataset:
    """
    Builds a batched `Dataset` from an array of image file paths and an array of the corresponding labels.

    Args:
        file_paths:
            Array of strings, each being the path to an image file;
            each path is passed as the first argument to the function encoding one data sample.
        labels:
            Array of strings, each being the label of the image at the same index in `file_paths`;
            each label is passed as the second argument to the function encoding one data sample.
        sample_encode_func:
            Passed as the `map_func` argument to the `map(...)` method of the new `Dataset`;
            should take an image path and a label and return a dictionary of the encoded image and label.
        batch_size:
            Passed as the `batch_size` argument to the `batch(...)` method of the new `Dataset`.

    Returns:
        The mapped, batched and prefetched `Dataset`.

    Raises:
        ValueError:
            If the two arrays are not of the same size.
    """
    if file_paths.size != labels.size:
        raise ValueError("Number of file paths must be equal to number of labels")

    autotune = tf.data.experimental.AUTOTUNE
    samples = tf.data.Dataset.from_tensor_slices((file_paths, labels))
    encoded = samples.map(map_func=sample_encode_func, num_parallel_calls=autotune)
    batched = encoded.batch(batch_size=batch_size)
    return batched.prefetch(buffer_size=autotune)
|
||||||
|
|
||||||
|
|
||||||
|
def get_datasets(file_paths_and_labels: Mapping[str, str], sample_encode_func: Callable[[str, str], dict],
                 batch_size: int, train_data_ratio: float, shuffle: bool = CONFIG.SHUFFLE_DATA
                 ) -> tuple[tf.data.Dataset, tf.data.Dataset]:
    """
    Splits a mapping of image file paths to labels into a training dataset and a validation dataset.

    Args:
        file_paths_and_labels:
            Mapping of image file paths to the labels of the corresponding images; this is the full dataset.
        sample_encode_func:
            Passed on to the `make_dataset(...)` function for both datasets;
            should take an image path and a label and return a dictionary of the encoded image and label.
        batch_size:
            Passed on to the `make_dataset(...)` function for both datasets.
        train_data_ratio:
            Share of the full dataset that goes into the training dataset;
            the number of training samples is that share of the total, rounded down.
            All remaining samples go into the validation dataset.
        shuffle:
            If True, the samples are shuffled pseudo-randomly before the split.

    Returns:
        2-tuple of the training dataset and the validation dataset.
    """
    # Work on (optionally shuffled) positions rather than on the samples themselves
    num_samples = len(file_paths_and_labels)
    positions = np.arange(num_samples)
    if shuffle:
        np.random.shuffle(positions)
    # Everything before the cutoff is training data, the rest is validation data
    cutoff = int(num_samples * train_data_ratio)
    file_paths = np.array(list(file_paths_and_labels.keys()))
    labels = np.array(list(file_paths_and_labels.values()))
    datasets = []
    for part in (positions[:cutoff], positions[cutoff:]):
        datasets.append(make_dataset(file_paths[part], labels[part], sample_encode_func, batch_size))
    train_dataset, valid_dataset = datasets
    return train_dataset, valid_dataset
|
||||||
|
|
||||||
|
|
||||||
|
def plot_images(images: Sequence[np.ndarray], labels: Sequence[str] = None, num_columns: int = 4,
                transpose: bool = True) -> None:
    """
    Shows the given images in a grid, filling it row by row.

    Args:
        images:
            The images to show, all of the same shape, with the channel as the last dimension;
            only the first channel is shown. Values are multiplied by 255 before being displayed.
        labels (optional):
            If given, the label at an image's index is used as the title above that image.
        num_columns (optional):
            Number of images per row; as many rows as needed are created.
        transpose (optional):
            If True (default), the first two dimensions of every image are swapped before showing it.
    """
    images = np.asarray(images)
    if images.size == 0:
        return  # Nothing to show
    if transpose:
        images = images.transpose(0, 2, 1, 3)
    images = (images[:, :, :, 0] * 255).astype('uint8')
    # Round up, so that an incomplete last row still gets its place in the grid
    num_rows = -(-len(images) // num_columns)
    # Without squeezing, the axes always come as a 2D array, whatever the number of rows and columns
    _, axs = plt.subplots(num_rows, num_columns, figsize=(10, 5), squeeze=False)
    for ax in axs.flat:
        # This way the unused cells of the grid stay empty as well
        ax.axis('off')
    for idx, image in enumerate(images):
        ax = axs[idx // num_columns, idx % num_columns]
        ax.imshow(image, cmap='gray')
        if labels is not None:
            ax.set_title(labels[idx])
    plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
class DatasetsInterface:
    """
    Bundles loading and pre-processing of the training and validation data for usage with a model.

    On construction the image files are looked up and the lookup tables for the characters are created.
    The datasets themselves are only created by calling `split_and_make_datasets`;
    until then the `training` and `validation` attributes are None.
    """

    def __init__(self, batch_size: int, data_dir: PathT,
                 extensions: Iterable[str] = CONFIG.DEFAULT_DATA_FILE_EXTENSIONS) -> None:
        images_data = load_images_data(data_dir=data_dir, extensions=extensions)
        self.file_paths_and_labels, self.characters = images_data
        lookup_tables = get_vocab_maps(characters=self.characters)
        self.forward_lookup_table, self.backward_lookup_table = lookup_tables
        self.sample_encode_func = get_sample_encoder(forward_lookup_table=self.forward_lookup_table)
        self.batch_size = batch_size
        self.training = None
        self.validation = None

    def split_and_make_datasets(self, train_data_ratio: float = (1 - CONFIG.VALIDATION_DATA_RATIO),
                                shuffle: bool = CONFIG.SHUFFLE_DATA) -> None:
        """
        Creates the training and validation datasets and stores them in the attributes of the same names.

        Both arguments are passed on to the `get_datasets(...)` function.
        """
        datasets = get_datasets(
            file_paths_and_labels=self.file_paths_and_labels,
            sample_encode_func=self.sample_encode_func,
            batch_size=self.batch_size,
            train_data_ratio=train_data_ratio,
            shuffle=shuffle,
        )
        self.training, self.validation = datasets
|
5
src/ccaptchas/types.py
Normal file
5
src/ccaptchas/types.py
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
|
||||||
|
# Type alias for arguments that accept a path either as a `Path` object or as a string
PathT = Union[Path, str]
|
Loading…
Reference in New Issue
Block a user