# Load PyTorch library
!pip3 install torch

import torch
import torch.nn as nn
import torch.nn.functional as F

batch_size = 5
seq_size = 10 # max length per input (masking will be used for sequences that aren't this max length)
x_lengths = [8, 5, 4, 10, 5] # lengths of each input sequence
embedding_dim = 100
rnn_hidden_dim = 256
output_dim = 4

# Initialize synthetic inputs
x_in = torch.randn(batch_size, seq_size, embedding_dim)
x_lengths = torch.tensor(x_lengths)
print (x_in.size())

# Initialize hidden state
hidden_t = torch.zeros((batch_size, rnn_hidden_dim))
print (hidden_t.size())

# Initialize RNN cell
rnn_cell = nn.RNNCell(embedding_dim, rnn_hidden_dim)
print (rnn_cell)

# Forward pass through RNN
x_in = x_in.permute(1, 0, 2) # RNNCell needs the time step at dim 0, so batch_size moves to dim 1

# Loop through the input time steps
hiddens = []
for t in range(seq_size):
    hidden_t = rnn_cell(x_in[t], hidden_t)
    hiddens.append(hidden_t)
hiddens = torch.stack(hiddens)
hiddens = hiddens.permute(1, 0, 2) # bring batch_size back to dim 0
print (hiddens.size())

# We also could've used a more abstracted layer
x_in = torch.randn(batch_size, seq_size, embedding_dim)
rnn = nn.RNN(embedding_dim, rnn_hidden_dim, batch_first=True)
out, h_n = rnn(x_in) # h_n is the last hidden state
print ("out: ", out.size())
print ("h_n: ", h_n.size())

def gather_last_relevant_hidden(hiddens, x_lengths):
    """Gather the hidden state at each sequence's last real (unpadded) time step."""
    x_lengths = x_lengths.long().detach().cpu().numpy() - 1
    out = []
    for batch_index, column_index in enumerate(x_lengths):
        out.append(hiddens[batch_index, column_index])
    return torch.stack(out)

# Gather the last relevant hidden state
z = gather_last_relevant_hidden(hiddens, x_lengths)
print (z.size())

# Forward pass through FC layer
fc1 = nn.Linear(rnn_hidden_dim, output_dim)
y_pred = fc1(z)
y_pred = F.softmax(y_pred, dim=1)
print (y_pred.size())
print (y_pred)

# GRU in PyTorch
gru = nn.GRU(input_size=embedding_dim, hidden_size=rnn_hidden_dim, batch_first=True)

# Initialize synthetic input
x_in = torch.randn(batch_size, seq_size, embedding_dim)
print (x_in.size())

# Forward pass
out, h_n = gru(x_in)
print ("out:", out.size())
print ("h_n:", h_n.size())

# BiGRU in PyTorch
bi_gru = nn.GRU(input_size=embedding_dim, hidden_size=rnn_hidden_dim,
                batch_first=True, bidirectional=True)

# Forward pass
out, h_n = bi_gru(x_in)
print ("out:", out.size()) # collection of all hidden states from the RNN for each time step
print ("h_n:", h_n.size()) # last hidden state from the RNN
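# Illustrative aside (an assumption, not part of the original code): PyTorch's
# packed sequences are a common alternative to gather_last_relevant_hidden above.
# A minimal sketch reusing the synthetic shapes and x_lengths defined earlier:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

packed_x = torch.randn(batch_size, seq_size, embedding_dim)
packed_gru = nn.GRU(embedding_dim, rnn_hidden_dim, batch_first=True)
packed_in = pack_padded_sequence(packed_x, x_lengths, batch_first=True, enforce_sorted=False)
packed_out, packed_h_n = packed_gru(packed_in)
out_unpacked, out_lengths = pad_packed_sequence(packed_out, batch_first=True)
print (packed_h_n[-1].size()) # hidden state at each sequence's true last step, no manual gather needed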
import os
from argparse import Namespace
import collections
import copy
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import torch

# Set Numpy and PyTorch seeds
def set_seeds(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

# Creating directories
def create_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

# Arguments
args = Namespace(
    seed=1234,
    cuda=True,
    shuffle=True,
    data_file="news.csv",
    split_data_file="split_news.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="news",
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    pretrained_embeddings=None,
    cutoff=25, # a token must appear at least this many times to be in the SequenceVocabulary
    num_epochs=5,
    early_stopping_criteria=5,
    learning_rate=1e-3,
    batch_size=64,
    embedding_dim=100,
    rnn_hidden_dim=128,
    hidden_dim=100,
    num_layers=1,
    bidirectional=False,
    dropout_p=0.1,
)

# Set seeds
set_seeds(seed=args.seed, cuda=args.cuda)

# Create save dir
create_dirs(args.save_dir)

# Expand filepaths
args.vectorizer_file = os.path.join(args.save_dir, args.vectorizer_file)
args.model_state_file = os.path.join(args.save_dir, args.model_state_file)

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

import re
import urllib.request

# Upload data from GitHub to the notebook's local drive
url = "https://raw.githubusercontent.com/LisonEvf/practicalAI-cn/master/data/news.csv"
response = urllib.request.urlopen(url)
html = response.read()
with open(args.data_file, 'wb') as fp:
    fp.write(html)

# Raw data
df = pd.read_csv(args.data_file, header=0)
df.head()

# Split by category
by_category = collections.defaultdict(list)
for _, row in df.iterrows():
    by_category[row.category].append(row.to_dict())
for category in by_category:
    print ("{0}: {1}".format(category, len(by_category[category])))

# Create split data
final_list = []
for _, item_list in sorted(by_category.items()):
    if args.shuffle:
        np.random.shuffle(item_list)
    n = len(item_list)
    n_train = int(args.train_size*n)
    n_val = int(args.val_size*n)
    n_test = int(args.test_size*n)

    # Give each data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
    for item in item_list[n_train+n_val:]:
        item['split'] = 'test'

    # Add to final list
    final_list.extend(item_list)

# df with split datasets
split_df = pd.DataFrame(final_list)
split_df["split"].value_counts()

# Preprocessing
def preprocess_text(text):
    text = ' '.join(word.lower() for word in text.split(" "))
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    text = text.strip()
    return text

split_df.title = split_df.title.apply(preprocess_text)

# Save to CSV
split_df.to_csv(args.split_data_file, index=False)
split_df.head()

class Vocabulary(object):
    def __init__(self, token_to_idx=None):

        # Token to index
        if token_to_idx is None:
            token_to_idx = {}
        self.token_to_idx = token_to_idx

        # Index to token
        self.idx_to_token = {idx: token \
                             for token, idx in self.token_to_idx.items()}

    def to_serializable(self):
        return {'token_to_idx': self.token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def add_token(self, token):
        if token in self.token_to_idx:
            index = self.token_to_idx[token]
        else:
            index = len(self.token_to_idx)
            self.token_to_idx[token] = index
            self.idx_to_token[index] = token
        return index

    def add_tokens(self, tokens):
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        return self.token_to_idx[token]

    def lookup_index(self, index):
        if index not in self.idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self.idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self.token_to_idx)

# Vocabulary instance
category_vocab = Vocabulary()
for index, row in df.iterrows():
    category_vocab.add_token(row.category)
print (category_vocab) # __str__
print (len(category_vocab)) # __len__
index = category_vocab.lookup_token("Business")
print (index)
print (category_vocab.lookup_index(index))
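# Illustrative aside (not part of the original code): a quick sanity check that
# the Vocabulary above survives the same JSON round trip used later to save and
# reload the vectorizer. Assumes the category_vocab built just above.
restored_vocab = Vocabulary.from_serializable(
    json.loads(json.dumps(category_vocab.to_serializable())))
print (restored_vocab) # same size as category_vocab
print (restored_vocab.lookup_token("Business") == category_vocab.lookup_token("Business"))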
from collections import Counter
import string

class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):
        super(SequenceVocabulary, self).__init__(token_to_idx)

        self.mask_token = mask_token
        self.unk_token = unk_token
        self.begin_seq_token = begin_seq_token
        self.end_seq_token = end_seq_token

        self.mask_index = self.add_token(self.mask_token)
        self.unk_index = self.add_token(self.unk_token)
        self.begin_seq_index = self.add_token(self.begin_seq_token)
        self.end_seq_index = self.add_token(self.end_seq_token)

        # Index to token
        self.idx_to_token = {idx: token \
                             for token, idx in self.token_to_idx.items()}

    def to_serializable(self):
        contents = super(SequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self.unk_token,
                         'mask_token': self.mask_token,
                         'begin_seq_token': self.begin_seq_token,
                         'end_seq_token': self.end_seq_token})
        return contents

    def lookup_token(self, token):
        return self.token_to_idx.get(token, self.unk_index)

    def lookup_index(self, index):
        if index not in self.idx_to_token:
            raise KeyError("the index (%d) is not in the SequenceVocabulary" % index)
        return self.idx_to_token[index]

    def __str__(self):
        return "<SequenceVocabulary(size=%d)>" % len(self.token_to_idx)

    def __len__(self):
        return len(self.token_to_idx)

# Get word counts
word_counts = Counter()
for title in split_df.title:
    for token in title.split(" "):
        if token not in string.punctuation:
            word_counts[token] += 1

# Create SequenceVocabulary instance
title_vocab = SequenceVocabulary()
for word, word_count in word_counts.items():
    if word_count >= args.cutoff:
        title_vocab.add_token(word)
print (title_vocab) # __str__
print (len(title_vocab)) # __len__
index = title_vocab.lookup_token("general")
print (index)
print (title_vocab.lookup_index(index))

class NewsVectorizer(object):
    def __init__(self, title_vocab, category_vocab):
        self.title_vocab = title_vocab
        self.category_vocab = category_vocab

    def vectorize(self, title):
        indices = [self.title_vocab.lookup_token(token) for token in title.split(" ")]
        indices = [self.title_vocab.begin_seq_index] + indices + \
                  [self.title_vocab.end_seq_index]

        # Create vector
        title_length = len(indices)
        vector = np.zeros(title_length, dtype=np.int64)
        vector[:len(indices)] = indices

        return vector, title_length

    def unvectorize(self, vector):
        tokens = [self.title_vocab.lookup_index(index) for index in vector]
        title = " ".join(token for token in tokens)
        return title

    @classmethod
    def from_dataframe(cls, df, cutoff):

        # Create class vocab
        category_vocab = Vocabulary()
        for category in sorted(set(df.category)):
            category_vocab.add_token(category)

        # Get word counts
        word_counts = Counter()
        for title in df.title:
            for token in title.split(" "):
                word_counts[token] += 1

        # Create title vocab
        title_vocab = SequenceVocabulary()
        for word, word_count in word_counts.items():
            if word_count >= cutoff:
                title_vocab.add_token(word)

        return cls(title_vocab, category_vocab)

    @classmethod
    def from_serializable(cls, contents):
        title_vocab = SequenceVocabulary.from_serializable(contents['title_vocab'])
        category_vocab = Vocabulary.from_serializable(contents['category_vocab'])
        return cls(title_vocab=title_vocab, category_vocab=category_vocab)

    def to_serializable(self):
        return {'title_vocab': self.title_vocab.to_serializable(),
                'category_vocab': self.category_vocab.to_serializable()}

# Vectorizer instance
vectorizer = NewsVectorizer.from_dataframe(split_df, cutoff=args.cutoff)
print (vectorizer.title_vocab)
print (vectorizer.category_vocab)
vectorized_title, title_length = vectorizer.vectorize(preprocess_text(
    "Roger Federer wins the Wimbledon tennis tournament."))
print (np.shape(vectorized_title))
print ("title_length:", title_length)
print (vectorized_title)
print (vectorizer.unvectorize(vectorized_title))
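# Illustrative aside (not part of the original code): a sketch of how several
# vectorized titles of different lengths could be padded with the <MASK> index
# so they stack into one batch array. The example titles are made up.
example_titles = ["federer wins again", "stocks rally after a strong earnings report"]
vectors_and_lengths = [vectorizer.vectorize(preprocess_text(t)) for t in example_titles]
max_seq_length = max(length for _, length in vectors_and_lengths)
padded = np.full((len(example_titles), max_seq_length),
                 vectorizer.title_vocab.mask_index, dtype=np.int64)
for i, (vector, length) in enumerate(vectors_and_lengths):
    padded[i, :length] = vector
print (padded)
print (vectorizer.unvectorize(padded[0]))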
from torch.utils.data import Dataset, DataLoader

class NewsDataset(Dataset):
    def __init__(self, df, vectorizer):
        self.df = df
        self.vectorizer = vectorizer

        # Data splits
        self.train_df = self.df[self.df.split=='train']
        self.train_size = len(self.train_df)
        self.val_df = self.df[self.df.split=='val']
        self.val_size = len(self.val_df)
        self.test_df = self.df[self.df.split=='test']
        self.test_size = len(self.test_df)
        self.lookup_dict = {'train': (self.train_df, self.train_size),
                            'val': (self.val_df, self.val_size),
                            'test': (self.test_df, self.test_size)}
        self.set_split('train')

        # Class weights (for imbalances)
        class_counts = df.category.value_counts().to_dict()
        def sort_key(item):
            return self.vectorizer.category_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)

    @classmethod
    def load_dataset_and_make_vectorizer(cls, split_data_file, cutoff):
        df = pd.read_csv(split_data_file, header=0)
        train_df = df[df.split=='train']
        return cls(df, NewsVectorizer.from_dataframe(train_df, cutoff))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, split_data_file, vectorizer_filepath):
        df = pd.read_csv(split_data_file, header=0)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(df, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return NewsVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self.vectorizer.to_serializable(), fp)

    def set_split(self, split="train"):
        self.target_split = split
        self.target_df, self.target_size = self.lookup_dict[split]

    def __str__(self):
        return "