Initial commit.
Signed-off-by: Slendi <slendi@socopon.com>
commit b8acb45e68
.gitignore (vendored, Normal file, 5 lines)
@@ -0,0 +1,5 @@
venv
train.json
dataset
final_model
create_dataset.sh (Executable file, 19 lines)
@@ -0,0 +1,19 @@
#!/bin/sh

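# Pipeline overview: extract messages from the Topical Chat dump, uwuify half
# of them, then split both halves into one file per line under dataset/normal
# and dataset/uwu so they can be read by text_dataset_from_directory.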
set -xe

python3 dataset_maker.py
uwuify -t $(nproc) messages_to_be_uwuified.txt messages_uwuified.txt

rm -f messages.txt messages_to_be_uwuified.txt

rm -rf dataset
mkdir -p dataset/normal dataset/uwu
#mv messages_good.txt dataset/normal/normal_text_1.txt
#mv messages_uwuified.txt dataset/uwu/uwu_text_1.txt

set +x
python3 split_file.py messages_good.txt dataset/normal $(nproc)
python3 split_file.py messages_uwuified.txt dataset/uwu $(nproc)

rm messages_good.txt messages_uwuified.txt
dataset_maker.py (Normal file, 48 lines)
@@ -0,0 +1,48 @@
import json
import os
import requests

input_file_path = "train.json"

output_file_path = "messages.txt"

if not os.path.exists(input_file_path):
    print('Downloading Topical Chat dataset')
    url = "https://raw.githubusercontent.com/alexa/Topical-Chat/master/conversations/train.json"
    response = requests.get(url)
    with open(input_file_path, 'wb') as file:
        file.write(response.content)

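# train.json maps conversation ids to conversation objects; each object's
# "content" list holds the individual turns, and only the "message" text of
# each turn is written out, one message per line.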
with open(input_file_path, 'r') as json_file, open(output_file_path, 'w') as output_file:
    data = json.load(json_file)

    for key, value in data.items():
        if "content" in value:
            for message_item in value["content"]:
                if "message" in message_item:
                    message = message_item["message"]
                    output_file.write(message + '\n')

print("Messages extracted and saved to", output_file_path)

# Split 50/50
input_file_path = "messages.txt"

output_file_path_1 = "messages_to_be_uwuified.txt"
output_file_path_2 = "messages_good.txt"

with open(input_file_path, 'r') as input_file:
    messages = input_file.readlines()

split_point = len(messages) // 2

messages_split_1 = messages[:split_point]
messages_split_2 = messages[split_point:]

with open(output_file_path_1, 'w') as output_file_1:
    output_file_1.writelines(messages_split_1)

with open(output_file_path_2, 'w') as output_file_2:
    output_file_2.writelines(messages_split_2)

print("Messages split into two files:", output_file_path_1, "and", output_file_path_2)
interactive.py (Executable file, 15 lines)
@@ -0,0 +1,15 @@
#!/usr/bin/env python3

import tensorflow as tf

@tf.keras.utils.register_keras_serializable(package='Custom', name=None)
def text_standardizer(input_data):
    lowercase = tf.strings.lower(input_data)
    return lowercase

with tf.keras.utils.CustomObjectScope({'text_standardizer': text_standardizer}):
    model = tf.keras.models.load_model('final_model')
    model.summary()

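# The exported model ends in a sigmoid (see train.py), so predictions near 1.0
# indicate the 'uwu' class and predictions near 0.0 the 'normal' class.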
while True:
    print(model.predict([input('> ')]))
split_file.py (Executable file, 43 lines)
@@ -0,0 +1,43 @@
#!/usr/bin/env python3

import os
import sys
import multiprocessing

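# Each worker re-reads the input file, takes its own [start:end] slice, and
# writes one file per line; the start offset keeps the text_N.txt names
# globally unique across workers.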
def split_lines(input_file, output_dir, start, end):
    with open(input_file, 'r') as input_text_file:
        lines = input_text_file.readlines()[start:end]

    for index, line in enumerate(lines):
        output_file = os.path.join(output_dir, f'text_{start + index}.txt')
        with open(output_file, 'w') as output_text_file:
            output_text_file.write(line)

def main():
    if len(sys.argv) != 4:
        print("Usage: python split_file.py input_file.txt output_directory nprocs")
    else:
        input_file = sys.argv[1]
        output_dir = sys.argv[2]
        nprocs = int(sys.argv[3])
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with open(input_file, 'r') as input_text_file:
            lines = input_text_file.readlines()

        chunk_size = len(lines) // nprocs
        processes = []

        for i in range(nprocs):
            start = i * chunk_size
            end = start + chunk_size if i < nprocs - 1 else len(lines)
            process = multiprocessing.Process(target=split_lines, args=(input_file, output_dir, start, end))
            process.start()
            processes.append(process)

        for process in processes:
            process.join()

if __name__ == "__main__":
    main()
train.py (Executable file, 145 lines)
@@ -0,0 +1,145 @@
#!/usr/bin/env python3

import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras.saving import register_keras_serializable
from tensorflow.keras import layers
from tensorflow.keras import losses

print(tf.__version__)

BATCH_SIZE = 32
SEED = 69420

print('Loading training dataset')
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'dataset',
    batch_size=BATCH_SIZE,
    subset='training',
    seed=SEED,
    label_mode="int",
    class_names=['normal', 'uwu'],
    validation_split=0.2,
)

print('Loading validation dataset')
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    'dataset',
    batch_size=BATCH_SIZE,
    subset='validation',
    seed=SEED,
    label_mode="int",
    class_names=['normal', 'uwu'],
    validation_split=0.2,
)

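# Note: the test split below re-reads the whole 'dataset' directory without a
# validation_split, so it overlaps with the data used for training.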
print('Loading testing dataset')
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    'dataset',
    batch_size=BATCH_SIZE,
    label_mode="int",
    class_names=['normal', 'uwu'],
)

@tf.keras.utils.register_keras_serializable(package='Custom', name=None)
def text_standardizer(input_data):
    lowercase = tf.strings.lower(input_data)
    return lowercase

MAX_FEATURES = 10000
SEQUENCE_LENGTH = 240

vectorize_layer = layers.TextVectorization(
    standardize=text_standardizer,
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH
)

train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

#text_batch, label_batch = next(iter(raw_train_ds))
#first_message, first_label = text_batch[0], label_batch[0]
#print('Message', first_message)
#print('Label', first_label)
#print('Vectorized message', vectorize_text(first_message, first_label))

train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

print('Creating model')

EMBEDDING_DIM = 16

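# Bag-of-embeddings classifier: token embeddings are mean-pooled and fed to a
# single-logit Dense layer, with dropout before and after the pooling.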
model = tf.keras.Sequential([
    layers.Embedding(MAX_FEATURES, EMBEDDING_DIM),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(1)])

model.summary()

model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

epochs = 10
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

loss, accuracy = model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

history_dict = history.history
history_dict.keys()

acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

print('Exporting model')

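# The export model bundles the vectorization layer, the trained network and a
# sigmoid, so it accepts raw strings and returns probabilities directly.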
export_model = tf.keras.Sequential([
    vectorize_layer,
    model,
    layers.Activation('sigmoid')
])

export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)

# Test it with `raw_test_ds`, which yields raw strings
loss, accuracy = export_model.evaluate(raw_test_ds)
print(accuracy)

print('Saving model')
export_model.save('final_model', save_format='tf')

while True:
    export_model.predict([input('> ')])