from tensorbay import GAS
from tensorbay.dataset import Segment
import numpy as np
import tensorflow as tf
from tensorflow import keras as tfk
import math
import pnlp
from pnlp import Dict
from tensorflow.data import Dataset
from transformers import AutoTokenizer
from tensorbay.dataset import Dataset as TensorBayDataset
from tensorbay.opendataset import Newsgroups20
from typing import Union
# --- Exploration: access the Newsgroups20 dataset remotely (TensorBay) and locally ---

# NOTE(review): an access key is a secret — load it from an environment variable
# or a secrets store instead of committing it to source control.
token = "Accesskey-098e0c26fdc79f31a085f5b897052ba4"
# Another available segment name is "20news-18828".
seg = "20news-bydate-train"

# Remote access through the GAS client.
gas = GAS(token)
dataset_client = gas.get_dataset("Newsgroups20")
segments = dataset_client.list_segment_names()
segment = Segment(seg, dataset_client)
ele = segment[0]

ds = TensorBayDataset("Newsgroups20", gas)
ds.keys()
ds.catalog.classification.categories

# Local access from a downloaded copy under ./data/.
dataset = Newsgroups20("./data/")
seg = dataset["20news-18828"]
x = seg[0]
x.label
x.open().read()[:5]

# Segments support len(); no need to count elements with a manual loop.
i = len(seg)
i
class NewsGroupSegment:
    """Expose a Newsgroups20 segment as a generator of (token_ids, label) tensors.

    ``client`` selects the data source: a ``GAS`` client reads the remote
    TensorBay copy, a string is treated as a local data directory.
    Each yielded sample is a pair of int32 tensors: the tokenized document
    (padded/truncated to ``max_length``) and its class index.
    """

    def __init__(
        self,
        client: Union[str, GAS],
        segment_name: str,
        tokenizer_path: str,
        label_file: str,
        max_length: int = 512
    ):
        if isinstance(client, GAS):
            self.dataset = TensorBayDataset("Newsgroups20", client)
        elif isinstance(client, str):
            self.dataset = Newsgroups20(client)
        else:
            raise ValueError("Invalid dataset client")
        self.segment = self.dataset[segment_name]
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        # Map each label name (one per line in label_file) to its row index.
        label_names = pnlp.read_lines(label_file)
        self.category_to_index = {name: idx for idx, name in enumerate(label_names)}

    def __call__(self):
        """Yield one (input_ids, category_index) tensor pair per document."""
        for item in self.segment:
            with item.open() as handle:
                # Raw newsgroup posts may contain non-UTF-8 bytes; drop them.
                text = handle.read().decode("utf8", errors="ignore")
            token_ids = self.tokenizer.encode(
                text, max_length=self.max_length, truncation=True, padding="max_length"
            )
            label_index = self.category_to_index[item.label.classification.category]
            yield (
                tf.convert_to_tensor(np.array(token_ids), dtype=tf.int32),
                tf.convert_to_tensor(label_index, dtype=tf.int32),
            )
def text_cnn(config, inputs):
    """TextCNN feature extractor: embedding -> parallel conv/max-pool -> flatten.

    ``config`` supplies vocab_size, embed_size, max_len, num_filters,
    filter_sizes (comma-separated string), and dropout. Returns the flattened,
    dropout-regularized feature vector for a downstream classifier head.
    """
    x = tfk.layers.Embedding(
        config.vocab_size,
        config.embed_size,
        embeddings_initializer=tfk.initializers.RandomUniform(minval=-1, maxval=1),
        input_length=config.max_len,
        name='embedding',
    )(inputs)
    # Add a trailing channel axis so Conv2D can slide over (time, embed) windows.
    x = tfk.layers.Reshape((config.max_len, config.embed_size, 1), name='add_channel')(x)

    filter_sizes = [int(s) for s in config.filter_sizes.split(',')]
    pooled = []
    for size in filter_sizes:
        feature_map = tfk.layers.Conv2D(
            config.num_filters,
            (size, config.embed_size),
            strides=(1, 1),
            padding='valid',
            data_format='channels_last',
            activation='relu',
            kernel_initializer='glorot_normal',
            bias_initializer=tfk.initializers.constant(0.1),
            name='convolution_{:d}'.format(size),
        )(x)
        # Max-over-time pooling: collapse each feature map to one value.
        pooled.append(
            tfk.layers.MaxPool2D(
                pool_size=(config.max_len - size + 1, 1),
                strides=(1, 1),
                padding='valid',
                data_format='channels_last',
                name='max_pooling_{:d}'.format(size),
            )(feature_map)
        )

    merged = tfk.layers.concatenate(pooled, axis=-1, name='concatenate')
    merged = tfk.layers.Flatten(data_format='channels_last', name='flatten')(merged)
    return tfk.layers.Dropout(config.dropout, name='dropout')(merged)
def build_model(config, module):
    """Assemble a Keras classification model around a feature-extractor module.

    ``module`` is a callable (e.g. ``text_cnn``) taking (config, inputs) and
    returning a feature tensor; a softmax Dense head of ``config.num_classes``
    units is stacked on top.
    """
    inputs = tfk.Input(shape=(config.max_len, ), name='input_data')
    features = module(config, inputs)
    outputs = tfk.layers.Dense(
        config.num_classes,
        activation='softmax',
        kernel_initializer='glorot_normal',
        bias_initializer=tfk.initializers.constant(0.1),
        kernel_regularizer=tfk.regularizers.l2(config.regularizers_lambda),
        bias_regularizer=tfk.regularizers.l2(config.regularizers_lambda),
        name='dense',
    )(features)
    return tfk.Model(inputs=inputs, outputs=outputs)
# Model hyper-parameters.
config = Dict({
    "vocab_size": 21128,
    "embed_size": 256,
    "max_len": 512,
    "num_filters": 128,
    "filter_sizes": "2,3,4",
    "dropout": 0.1,
    "regularizers_lambda": 0.01,
    "num_classes": 20
})

# Training setup; max_len must match config.max_len above.
max_len = 512
batch_size = 32
segment_name = "20news-18828"
client = "./data/"  # or GAS(token) for remote access
data = NewsGroupSegment(client, segment_name, "./bert/", "labels.txt", max_len)
epochs = 5
steps_per_epoch = math.ceil(len(data.segment) / batch_size)

dataset = Dataset.from_generator(
    data,
    output_signature=(
        # BUGFIX: the generator yields int32 token-id tensors, so the spec
        # must be int32 — it was float32, which makes from_generator fail
        # with a dtype mismatch on the first element.
        tf.TensorSpec(shape=(max_len, ), dtype=tf.int32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
    ),
).shuffle(buffer_size=len(data.segment), reshuffle_each_iteration=True).batch(batch_size).repeat(epochs)

model = build_model(config, text_cnn)
model.compile(
    optimizer=tfk.optimizers.Adamax(learning_rate=1e-3),
    loss=tfk.losses.SparseCategoricalCrossentropy(),
    metrics=[tfk.metrics.SparseCategoricalAccuracy()],
)
model.fit(dataset, epochs=epochs, steps_per_epoch=steps_per_epoch)
# Captured notebook output (not code): "Epoch 1/5"
# 一些使用中的问题: ("Some issues encountered in use:")