generated from daniil-berg/boilerplate-py

refactor entire package; add CLI

parent: be3aebaf07
commit: 7c9f97f21b
.gitignore (vendored) | 2
@@ -8,3 +8,5 @@
 /dist/
 # Python cache:
 __pycache__/
+
+saved_models/
@@ -6,16 +6,26 @@ from .config import CONFIG

 CMD = 'command'

 TRAIN = 'train'
 DATA_DIR = 'data_dir'
 SAVE_DIR = 'save_dir'
-FILE_EXTENSIONS = 'file_extensions'
+FILE_EXT = 'file_ext'
 BATCH_SIZE = 'batch_size'
+VALIDATION_RATIO = 'validation_ratio'
+IMG_WIDTH = 'img_width'
+IMG_HEIGHT = 'img_height'
 NUM_EPOCHS = 'num_epochs'
 EARLY_STOPPING_PATIENCE = 'early_stopping_patience'
+
+_PREPROCESSING_KEYS = (DATA_DIR, FILE_EXT, BATCH_SIZE, VALIDATION_RATIO, IMG_WIDTH, IMG_HEIGHT)
+_TRAINING_KEYS = (SAVE_DIR, NUM_EPOCHS, EARLY_STOPPING_PATIENCE)

 INFER = 'infer'
 MODEL_DIR = 'model_dir'
+IMAGES_DIR = 'images_dir'
+IMAGE_FILES = 'image_files'
+PLOT_RESULTS = 'plot_results'


 def ext_list(string: str) -> list[str]:
@@ -34,14 +44,14 @@ def parse_cli(args: Sequence[str] = None) -> dict[str, Any]:
         description="Character CAPTCHA Solver",
     )
     parser.add_argument(
-        '-E', f'--{FILE_EXTENSIONS.replace("_", "-")}',
-        default=CONFIG.DEFAULT_DATA_FILE_EXTENSIONS,
+        '-E', f'--{FILE_EXT.replace("_", "-")}',
+        default=CONFIG.DEFAULT_IMG_FILE_EXT,
         type=ext_list,
         help=f"When used in `{TRAIN}` mode, extensions of the image files to be used for training/testing the model. "
              f"When used in `{INFER}` mode, extensions of the image files to use the model on. "
-             f"Defaults to {CONFIG.DEFAULT_DATA_FILE_EXTENSIONS}."
+             f"Defaults to {CONFIG.DEFAULT_IMG_FILE_EXT}."
     )
-    subparsers = parser.add_subparsers(dest=CMD)
+    subparsers = parser.add_subparsers(dest=CMD, title="Commands")

     parser_train = subparsers.add_parser(TRAIN, help="trains a new model")
     parser_train.add_argument(
@@ -49,7 +59,9 @@ def parse_cli(args: Sequence[str] = None) -> dict[str, Any]:
         type=Path,
         help="Directory containing the image files to be used for training/testing the model."
     )
-    parser_train.add_argument(
+    preprocessing_group = parser_train.add_argument_group("Preprocessing options")
+    training_group = parser_train.add_argument_group("Training options")
+    training_group.add_argument(
         '-s', f'--{SAVE_DIR.replace("_", "-")}',
         default=CONFIG.DEFAULT_SAVE_DIR,
         type=Path,
@@ -57,20 +69,40 @@ def parse_cli(args: Sequence[str] = None) -> dict[str, Any]:
              f"current date and time will be created there and the model will be saved in that subdirectory. "
              f"Defaults to '{CONFIG.DEFAULT_SAVE_DIR}'."
     )
-    parser_train.add_argument(
+    preprocessing_group.add_argument(
         '-b', f'--{BATCH_SIZE.replace("_", "-")}',
         default=CONFIG.DEFAULT_BATCH_SIZE,
         type=int,
         help=f"The dataset will be divided into batches; this determines the number of images in each batch. "
              f"Defaults to {CONFIG.DEFAULT_BATCH_SIZE}."
     )
+    preprocessing_group.add_argument(
+        '-r', f'--{VALIDATION_RATIO.replace("_", "-")}',
+        default=CONFIG.DEFAULT_VALIDATION_RATIO,
+        type=float,
+        help=f"The dataset will be split into training and validation data; this argument should be a float between 0 "
+             f"and 1 determining the relative size of the validation dataset to the whole dataset. "
+             f"Defaults to {round(CONFIG.DEFAULT_VALIDATION_RATIO, 3)}."
+    )
+    preprocessing_group.add_argument(
+        '-W', f'--{IMG_WIDTH.replace("_", "-")}',
+        default=CONFIG.DEFAULT_IMG_WIDTH,
+        type=int,
+        help=f"The width of an image in pixels. Defaults to {CONFIG.DEFAULT_IMG_WIDTH}."
+    )
+    preprocessing_group.add_argument(
+        '-H', f'--{IMG_HEIGHT.replace("_", "-")}',
+        default=CONFIG.DEFAULT_IMG_HEIGHT,
+        type=int,
+        help=f"The height of an image in pixels. Defaults to {CONFIG.DEFAULT_IMG_HEIGHT}."
+    )
-    parser_train.add_argument(
+    training_group.add_argument(
         '-n', f'--{NUM_EPOCHS.replace("_", "-")}',
         default=CONFIG.DEFAULT_NUM_EPOCHS,
         type=int,
         help=f"The number of training epochs. Defaults to {CONFIG.DEFAULT_NUM_EPOCHS}."
     )
-    parser_train.add_argument(
+    training_group.add_argument(
         '-p', f'--{EARLY_STOPPING_PATIENCE.replace("_", "-")}',
         default=CONFIG.DEFAULT_EARLY_STOPPING_PATIENCE,
         type=int,
@@ -85,10 +117,24 @@ def parse_cli(args: Sequence[str] = None) -> dict[str, Any]:
         type=Path,
         help="Directory containing the model to use for inference."
     )
-    parser_infer.add_argument(
-        DATA_DIR,
-        type=Path,
-        help="Directory containing the image files to use the model on."
+    data_group = parser_infer.add_mutually_exclusive_group()
+    data_group.add_argument(
+        '-f', f'--{IMAGE_FILES.replace("_", "-")}',
+        type=Path,
+        nargs='*',
+        metavar='PATH',
+        help="Paths to image files to use the model on."
+    )
+    data_group.add_argument(
+        '-d', f'--{IMAGES_DIR.replace("_", "-")}',
+        type=Path,
+        metavar='PATH',
+        help="Path to directory containing the image files to use the model on."
+    )
+    parser_infer.add_argument(
+        '-p', f'--{PLOT_RESULTS.replace("_", "-")}',
+        action='store_true',
+        help="If set, a plot will be displayed, showing the images with the inferred labels."
     )
     return vars(parser.parse_args(args))
@@ -98,12 +144,15 @@ def main() -> None:
     cmd = kwargs.pop(CMD)
     if cmd == TRAIN:
         from .model import start
-        start(**kwargs)
+        from .preprocess import load_datasets
+        pre_kwargs = {k: kwargs.pop(k) for k in _PREPROCESSING_KEYS}
+        training_data, validation_data, vocabulary = load_datasets(**pre_kwargs)
+        start(training_data, validation_data, vocabulary, **kwargs)
     elif cmd == INFER:
         from .infer import start
         start(**kwargs)
     else:
-        raise NotImplemented
+        raise SystemExit  # Should be unreachable since the argument parser will throw an error earlier


 if __name__ == '__main__':
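For orientation, a quick sketch of how the reworked parser and key groups fit together. The module path `ccaptchas.__main__` and all argument values here are assumptions for illustration:

    # Hypothetical: parse a `train` invocation and split the result the way main() does.
    from ccaptchas.__main__ import parse_cli, _PREPROCESSING_KEYS

    kwargs = parse_cli(['train', 'data/captchas', '-b', '16', '-r', '0.125'])
    cmd = kwargs.pop('command')  # the subcommand name, stored under dest=CMD
    pre_kwargs = {k: kwargs.pop(k) for k in _PREPROCESSING_KEYS}
    # pre_kwargs now holds data_dir, file_ext, batch_size, validation_ratio, img_width
    # and img_height; the remaining kwargs (save_dir, num_epochs, early_stopping_patience)
    # are what model.start() expects.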
src/ccaptchas/config.py
@@ -5,16 +5,29 @@ class CONFIG(object):
     __slots__ = ()
     PROGRAM_NAME = 'ccaptchas'

-    DEFAULT_SAVE_DIR = Path('.', 'saved_models')
-    DEFAULT_DATA_FILE_EXTENSIONS = ('.jpg', '.png')
-    DEFAULT_BATCH_SIZE = 10
+    EXT_PNG, EXT_JPG = '.png', '.jpg'
+    DEFAULT_IMG_FILE_EXT = (EXT_PNG, EXT_JPG)
+
+    # StringLookup parameters:
+    DEFAULT_NUM_OOV_INDICES = 0  # assuming no out-of-vocabulary (OOV) characters will be encountered
+    DEFAULT_MASK_TOKEN = None  # assuming there will never be a value missing
+
+    # Data splitting:
+    DEFAULT_VALIDATION_RATIO = 1 / 8
+    DEFAULT_SHUFFLE_DATA = True
+
+    # Image processing:
+    DEFAULT_IMG_WIDTH, DEFAULT_IMG_HEIGHT = 250, 50  # Desired image dimensions
+
+    # Training hyper-parameters:
+    DEFAULT_BATCH_SIZE = 16
     DEFAULT_NUM_EPOCHS = 100
     DEFAULT_EARLY_STOPPING_PATIENCE = 10

-    VALIDATION_DATA_RATIO = 1 / 8
-    SHUFFLE_DATA = True
-    INPUT_LAYER_NAME_IMAGE, INPUT_LAYER_NAME_LABEL = 'image', 'label'
-    OUTPUT_LAYER_NAME = 'encoded_output'
+    DEFAULT_SAVE_DIR = Path('.', 'saved_models')
+    MODEL_NAME = f'{PROGRAM_NAME}_model'
+    LAYER_NAME_INPUT_IMAGE, LAYER_NAME_INPUT_LABEL = 'image', 'label'
+    LAYER_NAME_OUTPUT = 'encoded_output'
     MAX_STRING_LENGTH = 6  # Maximum number of characters in any captcha image in the dataset
-    IMG_WIDTH, IMG_HEIGHT = 250, 50  # Desired image dimensions
     VOCABULARY_FILE_NAME = '.vocabulary'
+    HISTORY_FILE_NAME = '.history.json'
src/ccaptchas/ctc_layer.py (new file)
@@ -0,0 +1,24 @@
import tensorflow as tf
import numpy as np

from .keras.backend import ctc_batch_cost
from .keras.layers import Layer


class CTCLayer(Layer):
    def __init__(self, name: str = None):
        super().__init__(name=name)
        self.loss_fn = ctc_batch_cost

    def call(self, y_true: np.ndarray = None, y_pred: np.ndarray = None) -> np.ndarray:
        # Compute the training-time loss value and add it
        # to the layer using `self.add_loss()`.
        batch_len = tf.cast(tf.shape(y_true)[0], dtype='int64')
        input_length = tf.cast(tf.shape(y_pred)[1], dtype='int64')
        label_length = tf.cast(tf.shape(y_true)[1], dtype='int64')
        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype='int64')
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype='int64')
        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)
        # At test time, just return the computed predictions
        return y_pred
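A minimal sketch of how this layer behaves, with made-up shapes and vocabulary size: at training time the CTC loss is registered as a side effect via `add_loss()`, while the softmax predictions pass through unchanged.

    import numpy as np
    import tensorflow as tf
    from ccaptchas.ctc_layer import CTCLayer

    # Batch of 2 label sequences (6 characters each) over a hypothetical 20-symbol
    # vocabulary, with softmax outputs over 62 down-sampled time steps
    # (vocabulary size + 1 blank class = 21).
    y_true = tf.constant(np.random.randint(0, 20, size=(2, 6)), dtype='float32')
    y_pred = tf.random.uniform((2, 62, 21))
    layer = CTCLayer(name='ctc_loss')
    out = layer(y_true, y_pred)  # returns y_pred unchanged
    print(layer.losses)          # the CTC batch cost registered via add_loss()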
src/ccaptchas/infer.py
@@ -1,63 +1,78 @@
+import sys
 from pathlib import Path
-from typing import Iterable
+from typing import Iterable, Sequence

 import numpy as np
 import tensorflow as tf
 from keras.api._v2.keras.models import Model, load_model
-from keras.api._v2.keras.layers.experimental.preprocessing import StringLookup
+from keras.api._v2.keras.layers import StringLookup
 from keras.api._v2.keras.backend import ctc_decode

 from .config import CONFIG
-from .preproc import encode_image, decode_label, plot_images
-from .types import PathT
+from .preprocess import process_image, decode_label, find_image_files, get_lookup_table
+from .types import PathT, ImgT, Array
+from .visualize import plot_images


-def images_to_input(*images) -> tf.data.Dataset:
-    array = np.array([encode_image(img) for img in images])
-    return tf.data.Dataset.from_tensor_slices(array)
+def process_predictions(predictions: tf.Tensor) -> tf.Tensor:
+    num_predictions = predictions.shape[0]  # corresponds to the number of images passed into the model for inference
+    output_width = predictions.shape[1]  # corresponds to the (down-sampled) width of an image
+    # It is worth noting that `predictions.shape[2]` corresponds to the size of the vocabulary + 1,
+    # i.e. one more than the number of distinct characters that can occur in a label.
+
+    # Since the `predictions` tensor is the output of a softmax activation function, we need to decode the values along
+    # the "width axis" from arrays of floats between 0 and 1 to single integers representing the inferred characters.
+    # (see CTC concepts)
+
+    # Construct a 1D array, each element representing the width of a single prediction, i.e. the down-sampled image width:
+    seq_lengths = np.ones(num_predictions) * output_width
+    # Retrieve the sequences of label indices inferred by the model:
+    sequences, _probabilities = ctc_decode(predictions, input_length=seq_lengths, greedy=True)
+    # Since we use a greedy approach, only one sequence per prediction is returned, so we discard the other dimensions:
+    sequences = sequences[0]
+    # Now this is a 2D tensor, for which `sequences.shape[0]` corresponds to the number of samples/images,
+    # while `sequences.shape[1]` corresponds to the size of the vocabulary + 1.
+    # Assuming n characters were inferred, the first n elements of each array will be the label indices of those
+    # characters, whereas the rest of the elements will be -1, implying blank labels. Since we know the maximum length
+    # a string of characters in an image can have, we can discard all those labels that must be blank.
+    # What we are then left with will be an array of relevant label indices for each image passed through the model.
+    # Using a backward lookup table, these can later be easily decoded to the actual characters.
+    return sequences[:, :CONFIG.MAX_STRING_LENGTH]


-def decode_batch_outputs(predictions, backward_lookup_table: StringLookup) -> list[str]:
-    input_len = np.ones(predictions.shape[0]) * predictions.shape[1]
-    # Use greedy search. For complex tasks, you can use beam search
-    sequences, _ = ctc_decode(predictions, input_length=input_len, greedy=True)
-    results = sequences[0][:, :CONFIG.MAX_STRING_LENGTH]
-    # Iterate over the results and get back the text
-    return [decode_label(result, backward_lookup_table) for result in results]
-
-
-def load_inference_model(path: PathT) -> Model:
-    with open(Path(path, CONFIG.VOCABULARY_FILE_NAME), 'r') as vocab_file:
-        backward_lookup_table = StringLookup(vocabulary=list(vocab_file.read()), mask_token=None, invert=True)
-    saved_model = load_model(path)
+def load_inference_model(model_dir: PathT) -> tuple[Model, StringLookup]:
+    with open(Path(model_dir, CONFIG.VOCABULARY_FILE_NAME), 'r') as vocab_file:
+        backward_lookup = get_lookup_table(vocab_file.read(), invert=True)
+    saved_model = load_model(model_dir)
     inference_model = Model(
-        saved_model.get_layer(name=CONFIG.INPUT_LAYER_NAME_IMAGE).input,
-        saved_model.get_layer(name=CONFIG.OUTPUT_LAYER_NAME).output
+        saved_model.get_layer(name=CONFIG.LAYER_NAME_INPUT_IMAGE).input,
+        saved_model.get_layer(name=CONFIG.LAYER_NAME_OUTPUT).output
     )
-
-    def infer_and_decode(x: tf.data.Dataset) -> list[str]:
-        return decode_batch_outputs(predictions=inference_model.predict(x), backward_lookup_table=backward_lookup_table)
-
-    inference_model.infer_and_decode = infer_and_decode
-    inference_model.backward_lookup_table = backward_lookup_table
-    return inference_model
+    return inference_model, backward_lookup


-def start(model_dir: PathT, data_dir: PathT,
-          file_extensions: Iterable[str] = CONFIG.DEFAULT_DATA_FILE_EXTENSIONS) -> None:
-    data_dir = Path(data_dir)
-    file_paths = []
-    for ext in file_extensions:
-        file_paths.extend(data_dir.glob(f'*{ext}'))
-    file_paths.sort()
-    count = len(file_paths)
-    if count > 24:
-        raise ValueError("Too many files")
-    # images = []
-    # for path in file_paths:
-    #     with open(path, 'rb') as f:
-    #         images.append(f.read())
-    dataset = images_to_input(*file_paths)
-    model = load_inference_model(model_dir)
-    labels = model.infer_and_decode(dataset.batch(count))
-    plot_images(list(dataset.as_numpy_iterator()), labels=labels)
+def predict_and_decode(images: Sequence[ImgT], model: Model, backward_lookup: StringLookup) -> tuple[Array, list[str]]:
+    dataset = np.array([process_image(img) for img in images])
+    encoded_labels = process_predictions(model.predict(dataset))
+    return dataset, [decode_label(label, backward_lookup) for label in encoded_labels]
+
+
+def load_and_infer(images: Sequence[ImgT], model_dir: PathT, plot_results: bool = False) -> list[str]:
+    model, backward_lookup = load_inference_model(model_dir)
+    images, labels = predict_and_decode(images, model, backward_lookup)
+    if plot_results:
+        per_plot = 24
+        for i in range(0, len(images), per_plot):
+            plot_images(images[i:(i + per_plot)], labels=labels[i:(i + per_plot)])
+    return labels
+
+
+def start(model_dir: PathT, image_files: Sequence[Path] = (), images_dir: PathT = None,
+          file_ext: Iterable[str] = CONFIG.DEFAULT_IMG_FILE_EXT, plot_results: bool = False) -> None:
+    if images_dir is not None:
+        image_files = sorted(find_image_files(images_dir, file_ext=file_ext))
+    if not image_files:
+        image_files = [sys.stdin.buffer.read()]
+    labels = load_and_infer(image_files, model_dir, plot_results=plot_results)
+    for label in labels:
+        print(label)
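Usage sketch for the new inference entry point (the model directory and image paths below are hypothetical). If `images_dir` is given it replaces `image_files`, and with neither given the image bytes are read from stdin:

    from pathlib import Path

    from ccaptchas.infer import start

    # Decode two captcha images with a previously trained model and show the plot.
    start(
        model_dir='saved_models/2021-11-28_18-30',
        image_files=[Path('captchas/a1b2c3.png'), Path('captchas/x9y8z7.png')],
        plot_results=True,
    )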
src/ccaptchas/keras/__init__.py (new file, empty)

src/ccaptchas/keras/backend.py (new file)
@@ -0,0 +1 @@
from keras.api._v2.keras.backend import *

src/ccaptchas/keras/callbacks.py (new file)
@@ -0,0 +1 @@
from keras.api._v2.keras.callbacks import *

src/ccaptchas/keras/layers.py (new file)
@@ -0,0 +1 @@
from keras.api._v2.keras.layers import *

src/ccaptchas/keras/models.py (new file)
@@ -0,0 +1 @@
from keras.api._v2.keras.models import *

src/ccaptchas/keras/optimizers.py (new file)
@@ -0,0 +1 @@
from keras.api._v2.keras.optimizers import *
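These one-line modules appear to exist so that the rest of the package imports all Keras symbols from a single internal location, pinning the non-public `keras.api._v2` path in one place, e.g.:

    # Elsewhere in the package, Keras symbols now come from the local shim modules:
    from ccaptchas.keras.layers import StringLookup
    from ccaptchas.keras.models import Model, load_model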
src/ccaptchas/model.py
@@ -1,66 +1,40 @@
-import os
+import logging
+import json
 from datetime import datetime
 from pathlib import Path
 from typing import Iterable

 import numpy as np
 import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers

 from .config import CONFIG
-from .preproc import DatasetsInterface
+from .ctc_layer import CTCLayer
+from .keras.callbacks import EarlyStopping, History
+from .keras.layers import Bidirectional, Conv2D, Dense, Dropout, Input, LSTM, MaxPooling2D, Reshape
+from .keras.models import Model
+from .keras.optimizers import Adam, Optimizer
 from .types import PathT


-THIS_DIR = os.path.dirname(os.path.realpath(__file__))
-# Build paths relative to this file's directory like this: os_path.join(THIS_DIR, ...)
-
-
-class CTCLayer(layers.Layer):
-    def __init__(self, name: str = None):
-        super().__init__(name=name)
-        self.loss_fn = keras.backend.ctc_batch_cost
-
-    def call(self, y_true: np.ndarray = None, y_pred: np.ndarray = None) -> np.ndarray:
-        # Compute the training-time loss value and add it
-        # to the layer using `self.add_loss()`.
-        batch_len = tf.cast(tf.shape(y_true)[0], dtype='int64')
-        input_length = tf.cast(tf.shape(y_pred)[1], dtype='int64')
-        label_length = tf.cast(tf.shape(y_true)[1], dtype='int64')
-        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype='int64')
-        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype='int64')
-        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
-        self.add_loss(loss)
-        # At test time, just return the computed predictions
-        return y_pred
-
-
-# Factor by which the image is going to be downsampled
-# by the convolutional blocks. We will be using two
-# convolution blocks and each block will have
-# a pooling layer which downsample the features by a factor of 2.
-# Hence total downsampling factor would be 4.
-downsample_factor = 4
+log = logging.getLogger(__name__)


 def build_model(alphabet_size: int,
-                img_width: int = CONFIG.IMG_WIDTH,
-                img_height: int = CONFIG.IMG_HEIGHT,
-                optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam()) -> keras.models.Model:
+                img_width: int = CONFIG.DEFAULT_IMG_WIDTH,
+                img_height: int = CONFIG.DEFAULT_IMG_HEIGHT,
+                optimizer: Optimizer = Adam()) -> Model:
+    log.info("Building model")
     # Inputs to the model
-    input_img = layers.Input(
+    input_img = Input(
         shape=(img_width, img_height, 1),
         dtype='float32',
-        name=CONFIG.INPUT_LAYER_NAME_IMAGE
+        name=CONFIG.LAYER_NAME_INPUT_IMAGE
     )
-    labels = layers.Input(
+    labels = Input(
         shape=(None, ),
         dtype='float32',
-        name=CONFIG.INPUT_LAYER_NAME_LABEL,
+        name=CONFIG.LAYER_NAME_INPUT_LABEL,
     )
     # First conv block
-    x = layers.Conv2D(
+    x = Conv2D(
         filters=32,
         kernel_size=(3, 3),
         activation='relu',
@@ -68,12 +42,12 @@ def build_model(alphabet_size: int,
         padding='same',
         name='conv1',
     )(input_img)
-    x = layers.MaxPooling2D(
+    x = MaxPooling2D(
         pool_size=(2, 2),
         name='pool1'
     )(x)
     # Second conv block
-    x = layers.Conv2D(
+    x = Conv2D(
         filters=64,
         kernel_size=(3, 3),
         activation='relu',
@@ -81,72 +55,72 @@ def build_model(alphabet_size: int,
         padding='same',
         name='conv2',
     )(x)
-    x = layers.MaxPooling2D(
+    x = MaxPooling2D(
         pool_size=(2, 2),
         name='pool2'
     )(x)
     # We have used two max. pooling layers with pool size and strides 2.
-    # Hence, downsampled feature maps are 4x smaller. The number of
+    # Hence, down-sampled feature maps are 4x smaller. The number of
     # filters in the last layer is 64. Reshape accordingly before
     # passing the output to the RNN part of the model
+    down_sample_factor = 4
     new_shape = (
-        (img_width // 4),
-        (img_height // 4) * 64
+        (img_width // down_sample_factor),
+        (img_height // down_sample_factor) * 64
     )
-    x = layers.Reshape(
+    x = Reshape(
         target_shape=new_shape,
         name='reshape'
     )(x)
-    x = layers.Dense(
+    x = Dense(
         units=64,
         activation='relu',
         name='dense1'
     )(x)
-    x = layers.Dropout(rate=0.2)(x)
+    x = Dropout(rate=0.2)(x)
     # RNNs
-    x = layers.Bidirectional(
-        layers.LSTM(
+    x = Bidirectional(
+        LSTM(
             units=128,
             return_sequences=True,
             dropout=0.25,
         )
     )(x)
-    x = layers.Bidirectional(
-        layers.LSTM(
+    x = Bidirectional(
+        LSTM(
             units=64,
             return_sequences=True,
             dropout=0.25,
         )
     )(x)
     # Output layer
-    x = layers.Dense(
+    x = Dense(
         units=alphabet_size + 1,
         activation='softmax',
-        name=CONFIG.OUTPUT_LAYER_NAME,
+        name=CONFIG.LAYER_NAME_OUTPUT,
     )(x)
     # Add CTC layer for calculating CTC loss at each step
     output = CTCLayer(name='ctc_loss')(labels, x)
     # Define the model
-    model = keras.models.Model(
+    model = Model(
         inputs=[input_img, labels],
         outputs=output,
-        name='ocr_model_v1'
+        name=CONFIG.MODEL_NAME
     )
     # Compile the model and return
+    log.debug("Compiling model")
     model.compile(optimizer=optimizer)
     return model


-def train_model(model: keras.models.Model, train_dataset: tf.data.Dataset, valid_dataset: tf.data.Dataset,
+def train_model(model: Model, train_dataset: tf.data.Dataset, valid_dataset: tf.data.Dataset,
                 num_epochs: int = CONFIG.DEFAULT_NUM_EPOCHS,
-                early_stopping_patience: int = CONFIG.DEFAULT_EARLY_STOPPING_PATIENCE) -> keras.callbacks.History:
-    # Add early stopping
-    early_stopping = keras.callbacks.EarlyStopping(
+                early_stopping_patience: int = CONFIG.DEFAULT_EARLY_STOPPING_PATIENCE) -> History:
+    early_stopping = EarlyStopping(
         monitor='val_loss',
         patience=early_stopping_patience,
         restore_best_weights=True,
     )
     # Train the model
+    log.debug("Beginning training")
     history = model.fit(
         x=train_dataset,
         validation_data=valid_dataset,
@@ -156,23 +130,20 @@ def train_model(model: keras.models.Model, train_dataset: tf.data.Dataset, valid
     return history


-def start(data_dir: PathT, save_dir: PathT, file_extensions: Iterable[str] = CONFIG.DEFAULT_DATA_FILE_EXTENSIONS,
-          batch_size: int = CONFIG.DEFAULT_BATCH_SIZE, num_epochs: int = CONFIG.DEFAULT_NUM_EPOCHS,
+def start(training_data: tf.data.Dataset, validation_data: tf.data.Dataset, vocabulary: str,
+          save_dir: PathT = CONFIG.DEFAULT_SAVE_DIR, num_epochs: int = CONFIG.DEFAULT_NUM_EPOCHS,
           early_stopping_patience: int = CONFIG.DEFAULT_EARLY_STOPPING_PATIENCE) -> None:
     save_dir = Path(save_dir, datetime.now().strftime('%Y-%m-%d_%H-%M'))
     save_dir.mkdir(parents=True)
-    print("\nConstructing datasets\n")
-    data_interface = DatasetsInterface(batch_size=int(batch_size), data_dir=data_dir, extensions=file_extensions)
-    data_interface.split_and_make_datasets()
-    print("\nBuilding model\n")
-    model = build_model(len(data_interface.characters))
-    print("\nBeginning training\n")
-    train_model(model, data_interface.training, data_interface.validation, num_epochs=num_epochs,
-                early_stopping_patience=early_stopping_patience)
-    print("\nSaving model\n")
+    model = build_model(len(vocabulary))
+    history = train_model(model, training_data, validation_data,
+                          num_epochs=num_epochs, early_stopping_patience=early_stopping_patience)
+    log.debug("Saving model")
     model.save(save_dir)
-    print("\nSaving vocabulary\n")
-    vocabulary = ''.join(data_interface.forward_lookup_table.get_vocabulary())
-    with open(os.path.join(save_dir, CONFIG.VOCABULARY_FILE_NAME), 'w') as f:
+    log.debug("Saving vocabulary")
+    with open(Path(save_dir, CONFIG.VOCABULARY_FILE_NAME), 'w') as f:
         f.write(vocabulary)
-    print("\nAll saved!\n")
+    log.debug("Saving history")
+    with open(Path(save_dir, CONFIG.HISTORY_FILE_NAME), 'w') as f:
+        json.dump(history.history, f, indent=4)
+    log.info("All saved!")
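Putting the refactored pieces together, the training path now mirrors what `main()` does in the CLI module (directory names below are hypothetical):

    from ccaptchas.preprocess import load_datasets
    from ccaptchas.model import start

    # Preprocessing and training are decoupled: datasets and vocabulary are built first...
    training_data, validation_data, vocabulary = load_datasets('data/captchas', batch_size=16)
    # ...then handed to the training routine, which saves the model, vocabulary and history.
    start(training_data, validation_data, vocabulary, save_dir='saved_models', num_epochs=100)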
src/ccaptchas/preproc.py (deleted)
@@ -1,330 +0,0 @@
import os
import shutil
from pathlib import Path
from typing import Union, Mapping, Sequence, Iterable, Callable

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

from .config import CONFIG
from .types import PathT


def label_files(src_dir: PathT, dest_dir: PathT, labels: Union[PathT, Sequence[str]],
                reverse: bool = False, extensions: Iterable[str] = None) -> None:
    """
    Copies files giving them new names by using specified labels.

    All matching files are sorted by their file name before applying the sequence of labels to them.
    The first file is named with the first label, the second is named with the second label, and so on.
    If a label duplicate is encountered, a dot followed by a counter is appended to the file name
    *preceding* the file extension, e.g. if 'some_label.jpg' exists, 'some_label.1.jpg' may be created.

    The number of matching files must be greater than or equal to the number of labels.
    Exactly one file is copied for every label; thus, after every label has been used, the operation ends.

    Args:
        src_dir:
            Path to directory containing the files to be copied/renamed
        dest_dir:
            Path to destination directory
        labels:
            Either a sequence of labels (strings) or a path to a file containing the labels (newline separated)
        reverse (optional):
            Defines which file receives which label;
            if False (default), the files in `img_dir` are sorted ascending by their file name,
            if True, the files are sorted descending by name.
        extensions (optional):
            Iterable of file extensions; only files with these extensions will be considered.

    """
    extensions = '' if extensions is None else tuple(extensions)
    file_names = [name for name in os.listdir(src_dir) if name.endswith(extensions)]
    file_names.sort(reverse=reverse)
    try:
        with open(labels, 'r') as f:
            labels = f.read().strip().split('\n')
    except TypeError:
        pass  # Assume labels is already a sequence of strings
    if not os.path.isdir(dest_dir):
        raise NotADirectoryError(f"'{dest_dir}' is not a directory.")
    if len(labels) > len(file_names):
        raise IndexError(f"There are more labels ({len(labels)}) than files "
                         f"in the source directory ({len(file_names)} matching).")
    for idx, label in enumerate(labels):
        file_name = file_names[idx]
        _, ext = os.path.splitext(file_name)
        while True:
            new_path = os.path.join(dest_dir, label + ext)
            if not os.path.exists(new_path):
                shutil.copyfile(os.path.join(src_dir, file_name), new_path)
                break
            pre_label, n = os.path.splitext(label)
            try:
                n = int(n[1:])
            except ValueError:
                label = label + '.1'
            else:
                label = pre_label + '.' + str(n + 1)


def load_images_data(data_dir: PathT, extensions: Iterable[str] = ('.jpg', '.png'), verbose: bool = True
                     ) -> tuple[dict[str, str], str]:
    """
    Creates a dictionary mapping file paths (of images) to their labels.
    Everything up to the first dot in the filename is taken to be the label;
    this naturally excludes file extensions, but also a filename like `ABC.1.jpg` results in the label `ABC`.
    Also creates a vocabulary of characters encountered in the file names.

    Args:
        data_dir:
            Path-like object or string to a directory containing the desired image files
        extensions (optional):
            Iterable of extensions that the files considered for the resulting data should be restricted to;
            defaults to restricting finds to JPEG and PNG files.
        verbose (optional):
            If True, the function will print out a summary of the findings before returning.

    Returns:
        2-tuple with the first element being a dictionary where the keys are the file paths and the values are the
        file names (i.e. image labels) and the second element being a string of all characters present in the labels.
    """
    data_dir = Path(data_dir)
    file_paths_and_labels, characters = {}, set()
    for file_path in data_dir.iterdir():
        if file_path.suffix not in extensions:
            continue
        label = file_path.name.split('.')[0]
        for char in label:
            characters.add(char)
        file_paths_and_labels[str(file_path)] = label
    if verbose:
        print("Number of images/labels found: ", len(file_paths_and_labels))
        print("Number of unique characters: ", len(characters))
        print("Characters present: ", characters)
    return file_paths_and_labels, ''.join(characters)


def get_vocab_maps(characters: Iterable[str], num_oov_indices: int = 0, mask_token: str = None
                   ) -> tuple[layers.StringLookup, layers.StringLookup]:
    """
    Constructs two table-based lookup objects that map characters to integers and back.

    Details about the `StringLookup` class in the documentation:
    https://tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/StringLookup

    Args:
        characters:
            An iterable of strings representing the vocabulary to be mapped
        num_oov_indices (optional):
            Passed to the `IndexLookup` constructor;
            defines the number of out-of-vocabulary (OOV) tokens to create;
            assuming that no OOV characters will be encountered, the default is 0.
        mask_token (optional):
            Passed to the `IndexLookup` constructor;
            the token representing missing values;
            assuming that there will never be a value missing, the default is None.

    Returns:
        2-tuple of `StringLookup` objects, the first one mapping characters to integers, the second doing the reverse.
        By default, no OOV or missing values are assumed to be encountered,
        and thus each index (uniquely) represents a character from the vocabulary.
    """
    char_to_int = layers.StringLookup(
        vocabulary=list(characters),
        num_oov_indices=num_oov_indices,
        mask_token=mask_token,
    )
    int_to_char = layers.StringLookup(
        vocabulary=char_to_int.get_vocabulary(),
        mask_token=mask_token,
        invert=True,
    )
    return char_to_int, int_to_char


def encode_image(img):
    """
    Creates a `Tensor` object from an image file and transposes it.
    """
    try:
        # 0. Read image
        img = tf.io.read_file(str(img))
    except ValueError:
        pass
    # 1. Decode and convert to grayscale
    img = tf.io.decode_png(img, channels=1)
    # 2. Convert to float32 in [0, 1] range
    img = tf.image.convert_image_dtype(img, tf.float32)
    # 3. Resize to the desired size
    img = tf.image.resize(img, [CONFIG.IMG_HEIGHT, CONFIG.IMG_WIDTH])
    # 4. Transpose the image because we want the time
    # dimension to correspond to the width of the image.
    return tf.transpose(img, perm=[1, 0, 2])


def encode_label(label: str, forward_lookup_table: layers.StringLookup):
    """
    Creates a `Tensor` object from a label string by passing its characters through a `StringLookup` instance.
    """
    return forward_lookup_table(tf.strings.unicode_split(label, input_encoding='UTF-8'))


def decode_label(tensor, backward_lookup_table: layers.StringLookup) -> str:
    return tf.strings.reduce_join(backward_lookup_table(tensor)).numpy().decode('utf-8')


def get_sample_encoder(forward_lookup_table: layers.StringLookup) -> Callable[[str, str], dict]:
    """
    Returns a function for usage in the `map(...)` method of a `Dataset` instance.

    The function will accept an image path and a label and return a dictionary;
    the dictionary values will be a tensor representing the image and a tensor representing the label;
    the keys for each are pre-configured and will correspond to the models input layers' names.

    Args:
        forward_lookup_table:
            Passed to the `encode_label` function; required for mapping individual characters to floats.

    Returns:
        Callable taking two strings as arguments and returning a dictionary with string keys and Tensor values.
    """
    def func(img_path: PathT, label: str) -> dict:
        return {
            CONFIG.INPUT_LAYER_NAME_IMAGE: encode_image(img_path),
            CONFIG.INPUT_LAYER_NAME_LABEL: encode_label(label, forward_lookup_table)
        }
    return func


def make_dataset(file_paths: np.ndarray, labels: np.ndarray,
                 sample_encode_func: Callable[[str, str], dict], batch_size: int) -> tf.data.Dataset:
    """
    Generates a `Dataset` instance from an array of image file paths and an array of corresponding labels.

    Args:
        file_paths:
            Array of strings, each representing a path to an image file;
            each of those paths will be passed into the function encoding one data sample (as the first argument).
        labels:
            Array of strings, each representing a label for an image pointed to by a file path
            in the `file_paths` array with the corresponding index;
            each of those labels will be passed into the function encoding one data sample (as the second argument).
        sample_encode_func:
            Will be passed as the `map_func` argument into the `map(...)` method of the new `Dataset`;
            should be a function taking two strings (image path and label) as arguments and
            returning a dictionary of Tensors representing the image and label.
        batch_size:
            Will be passed as the `batch_size` argument into the `batch(...)` method of the new `Dataset`;
            determines how the dataset will be divided into batches.

    Returns:
        A `Dataset` ready to be fed into a model's `fit(...)` method, provided that model has two input layers
        named in accordance with the keys of the dictionary returned by the `sample_encode_func` function.
    """
    if file_paths.size != labels.size:
        raise ValueError("Number of file paths must be equal to number of labels")

    dataset = tf.data.Dataset.from_tensor_slices((file_paths, labels))
    dataset = dataset.map(
        map_func=sample_encode_func,
        num_parallel_calls=tf.data.experimental.AUTOTUNE
    ).batch(
        batch_size=batch_size
    ).prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE
    )
    return dataset


def get_datasets(file_paths_and_labels: Mapping[str, str], sample_encode_func: Callable[[str, str], dict],
                 batch_size: int, train_data_ratio: float, shuffle: bool = CONFIG.SHUFFLE_DATA
                 ) -> tuple[tf.data.Dataset, tf.data.Dataset]:
    """
    Creates a training dataset and a validation dataset from a mapping of image file paths to labels.

    Args:
        file_paths_and_labels:
            Mapping with keys being image file paths and values being labels of the corresponding images;
            this represents the full dataset used for fitting the model.
        sample_encode_func:
            Will be passed as the `sample_encode_func` argument into the `make_dataset(...)` function;
            should be a function taking two strings (image path and label) as arguments and
            returning a dictionary of Tensors representing the image and label.
        batch_size:
            Will be passed as the `batch_size` argument into the `make_dataset(...)` function;
            determines how each dataset will be divided into batches.
        train_data_ratio:
            Floating point value between 0 and 1 determining what ratio of the full dataset will be used for training;
            this implies that (1 - `train_data_ratio`) will be the ratio used for validation.
        shuffle:
            If True, the full dataset is shuffled pseudo-randomly before being split.

    Returns:
        Two `Dataset` objects ready to be fed into a model's `fit(...)` method, provided that model has two input layers
        named in accordance with the keys of the dictionary returned by the `sample_encode_func` function.
    """
    # 1. Get the total size of the dataset
    size = len(file_paths_and_labels)
    # 2. Make an indices array and shuffle it, if required
    indices = np.arange(size)
    if shuffle:
        np.random.shuffle(indices)
    # 3. Get the size of training samples; that will be the array cutoff index for the data-indices array
    cutoff = int(size * train_data_ratio)
    train_indices, valid_indices = indices[:cutoff], indices[cutoff:]
    # 4. Split data into training and validation sets
    file_paths, labels = np.array(list(file_paths_and_labels.keys())), np.array(list(file_paths_and_labels.values()))
    x_train, x_valid = file_paths[train_indices], file_paths[valid_indices]
    y_train, y_valid = labels[train_indices], labels[valid_indices]
    # 5. Construct the actual Dataset-class objects
    train_dataset = make_dataset(x_train, y_train, sample_encode_func, batch_size)
    valid_dataset = make_dataset(x_valid, y_valid, sample_encode_func, batch_size)
    return train_dataset, valid_dataset


def plot_images(images: Sequence[np.ndarray], labels: Sequence[str] = None, num_columns: int = 4,
                transpose: bool = True) -> None:
    if transpose:
        images = tf.transpose(images, perm=[0, 2, 1, 3])
    images = images[:, :, :, 0] * 255
    images = images.numpy().astype('uint8')
    num_rows = len(images) // num_columns or 1
    _, axs = plt.subplots(num_rows, num_columns, figsize=(10, 5))
    for idx, image in enumerate(images):
        if num_rows == 1:
            if num_columns == 1:
                ax = axs
            else:
                ax = axs[idx // num_columns]
        else:
            ax = axs[idx // num_columns, idx % num_columns]
        ax.imshow(image, cmap='gray')
        if labels is not None:
            ax.set_title(labels[idx])
        ax.axis('off')
    plt.show()


class DatasetsInterface:
    """
    Convenience class for loading and pre-processing the training and validation data for usage with a model.
    """

    def __init__(self, batch_size: int, data_dir: PathT,
                 extensions: Iterable[str] = CONFIG.DEFAULT_DATA_FILE_EXTENSIONS) -> None:
        self.file_paths_and_labels, self.characters = load_images_data(data_dir=data_dir, extensions=extensions)
        self.forward_lookup_table, self.backward_lookup_table = get_vocab_maps(characters=self.characters)
        self.sample_encode_func = get_sample_encoder(forward_lookup_table=self.forward_lookup_table)
        self.batch_size = batch_size
        self.training, self.validation = None, None

    def split_and_make_datasets(self, train_data_ratio: float = (1 - CONFIG.VALIDATION_DATA_RATIO),
                                shuffle: bool = CONFIG.SHUFFLE_DATA) -> None:
        self.training, self.validation = get_datasets(file_paths_and_labels=self.file_paths_and_labels,
                                                      sample_encode_func=self.sample_encode_func,
                                                      batch_size=self.batch_size,
                                                      train_data_ratio=train_data_ratio,
                                                      shuffle=shuffle)
src/ccaptchas/preprocess.py (new file)
@@ -0,0 +1,221 @@
import logging
from pathlib import Path
from typing import Iterable

import numpy as np
import tensorflow as tf

from .config import CONFIG
from .keras.layers import StringLookup
from .types import PathT, SampleEncFuncT, ImgT


log = logging.getLogger(__name__)

UTF8 = 'UTF-8'
IMG_DECODE_MAP = {
    CONFIG.EXT_PNG: tf.image.decode_png,
    CONFIG.EXT_JPG: tf.image.decode_jpeg,
}


def find_image_files(data_dir: PathT, file_ext: Iterable[str] = CONFIG.DEFAULT_IMG_FILE_EXT) -> list[Path]:
    data_dir = Path(data_dir)
    if not data_dir.is_dir():
        raise NotADirectoryError
    log.debug("Finding labeled image files in directory '%s'", str(data_dir))
    img_paths = []
    for ext in file_ext:
        if not ext.startswith('.'):
            ext = f'.{ext}'
        img_paths.extend(path for path in data_dir.glob(f'*{ext}') if path.is_file())
    log.info("Found %d image files", len(img_paths))
    return img_paths


def get_all_characters(img_paths: Iterable[Path]) -> str:
    characters = set()
    for path in img_paths:
        characters.update(path.stem)
    characters = ''.join(characters)
    log.info("Identified %d distinct characters in the file labels: '%s'", len(characters), characters)
    return characters


def get_lookup_table(vocabulary: Iterable[str], invert: bool = False, **kwargs) -> StringLookup:
    """
    Constructs a string lookup table mapping characters to integers or vice-versa.

    Details about the `StringLookup` class in the documentation:
    https://keras.io/api/layers/preprocessing_layers/categorical/string_lookup/

    Args:
        vocabulary:
            An iterable of strings representing the vocabulary to be mapped
        invert (optional):
            If `True`, a backward lookup table is returned, which maps indices to characters. Otherwise a forward
            lookup table is returned mapping characters to indices. Defaults to `False`.
        **kwargs (optional):
            Other keyword arguments to pass into the `StringLookup` constructor.
            Defaults for `num_oov_indices` and `mask_token` are defined in the package config.

    Returns:
        `StringLookup` object with the specified properties.
    """
    kwargs.setdefault('num_oov_indices', CONFIG.DEFAULT_NUM_OOV_INDICES)
    kwargs.setdefault('mask_token', CONFIG.DEFAULT_MASK_TOKEN)
    if isinstance(vocabulary, str):
        vocabulary = list(vocabulary)
    return StringLookup(vocabulary=vocabulary, invert=invert, **kwargs)


def get_vocab_maps(characters: str, **kwargs) -> tuple[StringLookup, StringLookup]:
    """
    Constructs two table-based lookup objects that map characters to integers and back.

    See `get_lookup_table` for details.

    Args:
        characters:
            A string of all characters in the vocabulary to be mapped; the characters should all be distinct.
        **kwargs (optional):
            Keyword arguments to pass into both `StringLookup` constructors.
            Must not contain the `invert` and `vocabulary` keywords.
            Defaults for `num_oov_indices` and `mask_token` are defined in the package config.

    Returns:
        2-tuple of `StringLookup` objects, the first one mapping characters to integers, the second doing the reverse.
    """
    char_to_int = get_lookup_table(characters, invert=False, **kwargs)
    int_to_char = get_lookup_table(char_to_int.get_vocabulary(), invert=True, **kwargs)
    log.info("Constructed vocabulary lookup tables")
    return char_to_int, int_to_char


def split_data(img_paths: Iterable[Path], validation_ratio: float = CONFIG.DEFAULT_VALIDATION_RATIO,
               shuffle: bool = CONFIG.DEFAULT_SHUFFLE_DATA) -> tuple[np.ndarray, np.ndarray]:
    """
    Splits an iterable of image paths into two arrays of training and validation data.

    Args:
        img_paths:
            Iterable of paths to the image files to be used for training and validation.
        validation_ratio:
            Float between 0 and 1 determining what ratio of the full dataset will be used for validation;
            this implies that (1 - `validation_ratio`) will be the ratio used for training.
        shuffle:
            If True, the full dataset is shuffled pseudo-randomly before being split.

    Returns:
        2-tuple of 2D numpy arrays, the first representing the training data and the second representing the validation
        data. The first dimension of each array is the dataset dimension (i.e. the samples) and the second contains the
        path (as a string) at index 0 and the label for each image at index 1.
    """
    if not 0 < validation_ratio < 1:
        raise ValueError
    paths_and_labels = np.array(tuple((str(path), path.stem) for path in img_paths))
    # 1. Get the total size of the dataset
    size = len(paths_and_labels)
    cutoff = int(size * (1 - validation_ratio))
    # 2. Make an indices array and shuffle it, if required
    indices = np.arange(size)
    if shuffle:
        np.random.shuffle(indices)
    # 3. Split data into training and validation sets
    training_data, validation_data = paths_and_labels[indices[:cutoff]], paths_and_labels[indices[cutoff:]]
    log.info("Split data into %d images for training and %d for validation", len(training_data), len(validation_data))
    return training_data, validation_data


def process_image(img: ImgT, img_width: int = CONFIG.DEFAULT_IMG_WIDTH,
                  img_height: int = CONFIG.DEFAULT_IMG_HEIGHT) -> tf.Tensor:
    # 0. Read image
    if isinstance(img, (str, Path)):
        img = tf.io.read_file(str(img))
    # 1. Decode and convert to grayscale
    img = tf.io.decode_image(img, channels=1, expand_animations=False)
    # img = tf.io.decode_jpeg(img, channels=1)
    # 2. Convert to float32 in [0, 1] range
    img = tf.image.convert_image_dtype(img, tf.float32)
    # 3. Resize to the desired size
    img = tf.image.resize(img, [img_height, img_width])
    # 4. Transpose the image because we want the time
    # dimension to correspond to the width of the image.
    return tf.transpose(img, perm=[1, 0, 2])


def encode_label(label: str, forward_lookup: StringLookup):
    """
    Creates a `Tensor` object from a label string by passing its characters through a `StringLookup` instance.
    """
    return forward_lookup(tf.strings.unicode_split(label, input_encoding=UTF8))


def decode_label(tensor: tf.Tensor, backward_lookup: StringLookup) -> str:
    return tf.strings.reduce_join(backward_lookup(tensor)).numpy().decode(UTF8)


def get_sample_encode_func(forward_lookup: StringLookup, img_width: int = CONFIG.DEFAULT_IMG_WIDTH,
                           img_height: int = CONFIG.DEFAULT_IMG_HEIGHT) -> SampleEncFuncT:
    def encode_sample(img_path: PathT, label: str) -> dict[str, tf.Tensor]:
        log.debug("Encoding image '%s'", str(img_path))
        img = process_image(tf.io.read_file(img_path), img_width=img_width, img_height=img_height)
        label = encode_label(label, forward_lookup)
        # Return a dict as our model is expecting two inputs
        return {CONFIG.LAYER_NAME_INPUT_IMAGE: img, CONFIG.LAYER_NAME_INPUT_LABEL: label}
    return encode_sample


def make_dataset(data: np.ndarray, sample_encode_func: SampleEncFuncT, batch_size: int) -> tf.data.Dataset:
    """
    Generates a `Dataset` instance from an array of image file paths and corresponding labels.

    Args:
        data:
            A 2D numpy array representing the data and labels to turn into a dataset for training/validation.
            The first dimension of the array is the dataset dimension (i.e. the samples) and the second contains the
            path (as a string) at index 0 and the label for each image at index 1.
            Each path-label-pair will be passed into the `sample_encode_func` during construction of the dataset as
            the only two positional arguments.
        sample_encode_func:
            Will be passed as the `map_func` argument into the `map(...)` method of the new `Dataset`;
            should be a function taking two strings (image path and label) as arguments and
            returning a dictionary of Tensors representing the image and label.
        batch_size:
            Will be passed as the `batch_size` argument into the `batch(...)` method of the new `Dataset`;
            determines how the dataset will be divided into batches.

    Returns:
        A `Dataset` ready to be fed into a model's `fit(...)` method, provided that model has two input layers
        named in accordance with the keys of the dictionary returned by the `sample_encode_func` function.
    """
    log.info("Constructing dataset of %d samples, split into batches of %d", len(data), batch_size)
    dataset = tf.data.Dataset.from_tensor_slices((data[:, 0], data[:, 1]))
    dataset = dataset.map(
        map_func=sample_encode_func,
        num_parallel_calls=tf.data.experimental.AUTOTUNE
    ).batch(
        batch_size=batch_size
    ).prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE
    )
    return dataset


def load_datasets(data_dir: PathT,
                  file_ext: Iterable[str] = CONFIG.DEFAULT_IMG_FILE_EXT,
                  batch_size: int = CONFIG.DEFAULT_BATCH_SIZE,
                  validation_ratio: float = CONFIG.DEFAULT_VALIDATION_RATIO,
                  shuffle: bool = CONFIG.DEFAULT_SHUFFLE_DATA,
                  img_width: int = CONFIG.DEFAULT_IMG_WIDTH,
                  img_height: int = CONFIG.DEFAULT_IMG_HEIGHT) -> tuple[tf.data.Dataset, tf.data.Dataset, str]:
    log.info("Constructing datasets")
    img_paths = find_image_files(data_dir, file_ext=file_ext)
    characters = get_all_characters(img_paths)
    forward_lookup, _ = get_vocab_maps(characters)
    arr_train, arr_valid = split_data(img_paths, validation_ratio=validation_ratio, shuffle=shuffle)
    encode_func = get_sample_encode_func(forward_lookup, img_width=img_width, img_height=img_height)
    ds_train = make_dataset(arr_train, encode_func, batch_size=batch_size)
    ds_valid = make_dataset(arr_valid, encode_func, batch_size=batch_size)
    assert characters == ''.join(forward_lookup.get_vocabulary())
    return ds_train, ds_valid, characters
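A round-trip sketch of the forward and backward lookup tables (the vocabulary and label strings are made up):

    from ccaptchas.preprocess import decode_label, encode_label, get_vocab_maps

    forward, backward = get_vocab_maps('abcdef123')
    encoded = encode_label('fad12', forward)   # tensor of integer indices into the vocabulary
    decoded = decode_label(encoded, backward)  # back to the original string
    assert decoded == 'fad12'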
src/ccaptchas/types.py
@@ -1,5 +1,11 @@
 from pathlib import Path
-from typing import Union
+from typing import Callable, Union

+import numpy as np
+import tensorflow as tf


 PathT = Union[Path, str]
+SampleEncFuncT = Callable[[PathT, str], dict[str, tf.Tensor]]
+ImgT = Union[PathT, bytes]
+Array = np.ndarray
src/ccaptchas/visualize.py (new file)
@@ -0,0 +1,28 @@
from typing import Sequence

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf


def plot_images(images: Sequence[np.ndarray], labels: Sequence[str] = None, num_columns: int = 4,
                transpose: bool = True) -> None:
    if transpose:
        images = tf.transpose(images, perm=[0, 2, 1, 3])
    images = images[:, :, :, 0] * 255
    images = images.numpy().astype('uint8')
    num_rows = len(images) // num_columns or 1
    _, axs = plt.subplots(num_rows, num_columns, figsize=(10, 5))
    for idx, image in enumerate(images):
        if num_rows == 1:
            if num_columns == 1:
                ax = axs
            else:
                ax = axs[idx % num_columns]
        else:
            ax = axs[idx // num_columns, idx % num_columns]
        ax.imshow(image, cmap='gray')
        if labels is not None:
            ax.set_title(labels[idx])
        ax.axis('off')
    plt.show()
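Usage sketch with dummy data (real input would be the width-transposed grayscale tensors produced by `preprocess.process_image`):

    import numpy as np

    from ccaptchas.visualize import plot_images

    # Eight fake 250x50 grayscale "images" in [0, 1], shown in a 4-column grid.
    batch = np.random.rand(8, 250, 50, 1).astype('float32')
    plot_images(batch, labels=list('ABCDEFGH'), num_columns=4)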