#!/usr/bin/env python
# coding: utf-8

# # TF-IDF for tabular data featurization and classification
# 
# This notebook describes applying TF-IDF to database columns. It generates some synthetic data, featurizes it, applies TF-IDF, and visualizes the results with t-SNE. There is a corresponding blog post at [LINK HERE]

# #### Load some libraries and set some plotting configurations

# In[1]:


import numpy as np
import pandas as pd
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib import pyplot as plt


# In[2]:


sns.set()
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size": 8, "axes.labelsize": 10,
                             "xtick.labelsize": 10, "ytick.labelsize": 10})


# In[3]:


from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE


# In[4]:


from collections import OrderedDict, Counter


# In[5]:


from faker import Faker
fake = Faker()
Faker.seed(101)  # class-level seeding; instance-level fake.seed() was removed in newer Faker releases


# #### Some useful functions and variables

# In[6]:


synthetic_data_factories = OrderedDict({
    "email": fake.email,
    "phone_number": fake.phone_number,
    "float_number": lambda: str(np.random.uniform(-180, 180)),
    "binary_number": lambda: str(np.random.randint(0, 2)),
    "UUID": fake.uuid4
})


# In[7]:


placeholders = OrderedDict({"-": 1, "": 1, "0": 1})


# In[8]:


def simulated_column_data(factory, num_rows, placeholders, placeholder_probability=0):
    column = []
    # normalize the placeholder weights into selection probabilities
    placeholder_selection_frequency = [
        val / sum(placeholders.values()) for val in placeholders.values()
    ]
    for _ in range(num_rows):
        is_placeholder = np.random.uniform(0, 1) < placeholder_probability
        if is_placeholder:
            cell_value = np.random.choice(
                list(placeholders.keys()), size=1,
                p=placeholder_selection_frequency)[0]
        else:
            cell_value = factory()
        column.append(cell_value)
    return column


# #### Generate the synthetic data

# In[9]:


dataframe_list = []
placeholder_prob = 0.9
columns_per_class = 20
labels = []
for data_class, synthetic_data_factory in synthetic_data_factories.items():
    for column_idx in range(columns_per_class):
        labels.append(data_class)
        df = pd.DataFrame(
            {"column_name": "{}{:03d}".format(data_class, column_idx + 1),
             "data_class": data_class,
             "cell_value": simulated_column_data(
                 synthetic_data_factory, 100, placeholders, placeholder_prob
             )}
        )
        dataframe_list.append(df)
df = pd.concat(dataframe_list, axis=0)
labels = np.array(labels)


# The most common values. Note that values tend to be either very common (placeholders) or unique (authentic data).

# In[10]:


Counter(df.cell_value).most_common(10)
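# A quick, hedged sanity check of the generated frame (this cell is illustrative and not part of the original notebook): with five data classes, `columns_per_class = 20`, and 100 simulated cells per column, each class should contribute 20 * 100 = 2000 rows.

# In[ ]:


df.groupby("data_class").size()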
# #### Featurizers

# In[11]:


import re
import uuid

import phonenumbers


def regex_feature(value, patt):
    return re.search(patt, value.strip().lower()) is not None

def matches_email(value):
    patt = (
        r"^([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`{|}~-]+)*"  # doesn't start with a period
        r"(@|\sat\s)"                           # requires an `at` clause
        r"(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?"   # domain name doesn't start or end with a hyphen
        r"(\.|\sdot\s))+"                       # one or more second-level domains
        r"[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)$"    # top-level domain doesn't start or end with a hyphen
    )
    return regex_feature(value, patt)

def matches_uuid(value):
    try:
        uuid.UUID(value)
        return True
    except ValueError:
        return False

def matches_phone_number(value):
    regions = ["US", None]
    for region in regions:
        try:
            phone_number = phonenumbers.parse(value, region)
            if phonenumbers.is_valid_number(phone_number):
                return True
        except phonenumbers.NumberParseException:
            pass
    return False

def is_float(value):
    try:
        _ = float(value)
        return True
    except ValueError:
        return False

def is_int(value):
    try:
        _ = int(value)
        return True
    except ValueError:
        return False

def is_len1(value):
    return len(value) == 1


# In[12]:


featurizers = OrderedDict({
    "email": matches_email,
    "uuid": matches_uuid,
    "phone_number": matches_phone_number,
    "is_float": is_float,
    "is_int": is_int,
    "is_len1": is_len1
})


# In[13]:


def featurize(value, featurizers):
    return OrderedDict({feature_name: featurizer(value)
                        for feature_name, featurizer in featurizers.items()})

def stringify_features(value, featurizers):
    return ''.join([str(int(binary_feature))
                    for binary_feature in featurize(value, featurizers).values()])


# Here are a few examples of featurization

# In[14]:


featurize("---", featurizers)


# In[15]:


featurize("-", featurizers)


# In[16]:


featurize("18005551212", featurizers)


# An example of stringifying a feature set

# In[17]:


stringify_features("18005551212", featurizers)


# Apply the featurizers to the data and update the data frame. The Boolean features are turned into strings of 0 and 1 values, which then form the **words** of our document set.

# In[18]:


df.loc[:, "word"] = df.cell_value.apply(lambda x: stringify_features(x, featurizers))


# The most common feature words. Note that the bifurcation into placeholders and unique values has been removed.

# In[19]:


Counter(df.word).most_common()


# #### Generating documents
# 
# Here we generate one document per column, in three variations: from the actual terms, from the feature words, and from the deduplicated feature words.

# In[20]:


# for actual terms
docs_value = []
for column_name in df.column_name.unique():
    docs_value.append(
        list(
            df.query("column_name == \"{}\"".format(column_name)).cell_value
        )
    )


# In[21]:


# for feature words
docs_words = []
for column_name in df.column_name.unique():
    docs_words.append(
        list(
            df.query("column_name == \"{}\"".format(column_name)).word
        )
    )


# In[22]:


# for unique values, i.e. sub-linear (binary) term frequency
docs_unique = []
for column_name in df.column_name.unique():
    docs_unique.append(
        list(
            np.unique(df.query("column_name == \"{}\"".format(column_name)).word)
        )
    )
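# A hedged aside (illustrative only, not in the original notebook): with six binary featurizers there are at most 2**6 = 64 distinct feature words, so the deduplicated documents in `docs_unique` are much shorter than the 100-term documents in `docs_words`.

# In[ ]:


print(sorted(set(len(doc) for doc in docs_words)))  # every feature-word document has 100 terms
print(max(len(doc) for doc in docs_unique))         # bounded above by 64, typically far fewer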
# #### TF-IDF vectorization

# In[23]:


# need an identity function to pass to TF-IDF,
# since we've already parsed the terms into documents
def identity_fun(x):
    return x


# Here we define the TF-IDF vectorizer. The relevant arguments are:
# 
# * `analyzer`: we analyze at the word level, as opposed to the character n-gram level.
# * `preprocessor`: function to preprocess the values. Because we've already processed them, the identity function is used.
# * `tokenizer`: function to tokenize the values. Because we've already tokenized them, the identity function is used.
# * `token_pattern`: regular expression used to tokenize the inputs. Because we supply our own `tokenizer`, this is set to `None`.
# * `sublinear_tf`: Boolean to apply sublinear (logarithmic) term frequency. A binary term frequency is more appropriate for this problem, so we leave this off and instead applied it as a preprocessing step above, by deduplicating the words in `docs_unique`.
# * `smooth_idf`: Boolean to apply smoothed inverse document frequency.

# In[24]:


# Instantiate the TF-IDF vectorizer
tfidf_mod = TfidfVectorizer(
    analyzer='word',
    preprocessor=identity_fun,
    tokenizer=identity_fun,
    token_pattern=None,
    sublinear_tf=False,
    smooth_idf=True
)


# In[25]:


# generate the TF-IDF vectors for each of the three variations of document
tfidf_value = tfidf_mod.fit_transform(docs_value)
tfidf_words = tfidf_mod.fit_transform(docs_words)
tfidf_unique = tfidf_mod.fit_transform(docs_unique)


# #### Visualizing results with t-SNE
# 
# Here we apply t-distributed stochastic neighbor embedding (t-SNE) to the TF-IDF vectors to visualize similarities. The algorithm projects points from a high-dimensional space into a lower-dimensional one (e.g. two dimensions) while approximately preserving relative distances.

# In[26]:


tsne_mod = TSNE(metric="cosine", perplexity=20)


# The actual-terms documents

# In[27]:


# .toarray() rather than .todense(): TSNE expects an ndarray, not a matrix
Y = tsne_mod.fit_transform(tfidf_value.toarray())


# In[28]:


pl_df = pd.DataFrame(Y).rename(columns={0: "x1", 1: "x2"})
pl_df = pl_df.assign(data_class=labels)
pl_df.to_csv("tsne_terms.csv", index=False)


# In[29]:


sns.scatterplot(x="x1", y="x2", hue="data_class", style="data_class", data=pl_df)
plt.title("t-SNE: document actual terms")


# The feature-words documents

# In[30]:


Y = tsne_mod.fit_transform(tfidf_words.toarray())


# In[31]:


pl_df = pd.DataFrame(Y).rename(columns={0: "x1", 1: "x2"})
pl_df = pl_df.assign(data_class=labels)
pl_df.to_csv("tsne_tfidf.csv", index=False)


# In[32]:


sns.scatterplot(x="x1", y="x2", hue="data_class", style="data_class", data=pl_df)
plt.title("t-SNE: document feature words")


# The feature-words documents with sub-linear (binary) term frequencies

# In[33]:


Y = tsne_mod.fit_transform(tfidf_unique.toarray())


# In[34]:


pl_df = pd.DataFrame(Y).rename(columns={0: "x1", 1: "x2"})
pl_df = pl_df.assign(data_class=labels)
pl_df.to_csv("tsne_tfidf_uniq.csv", index=False)


# In[35]:


sns.scatterplot(x="x1", y="x2", hue="data_class", style="data_class", data=pl_df)
plt.title("t-SNE: document feature words with binary term-frequency")


# The inverse-document-frequency values for the feature words, and for the actual terms

# In[36]:


tfidf_mod.fit_transform(docs_words)
tfidf_mod.idf_


# In[37]:


tfidf_mod.fit_transform(docs_value)
tfidf_mod.idf_[0:5]
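# The imports include `RandomForestClassifier`, and the title promises classification, but no model is fit above. A hedged sketch of that final step (illustrative only; the choice of `cross_val_score`, five folds, and the binary-term-frequency vectors are assumptions, not the author's method):

# In[ ]:


from sklearn.model_selection import cross_val_score

# one TF-IDF row per column, one label per column
clf = RandomForestClassifier(n_estimators=100, random_state=0)
scores = cross_val_score(clf, tfidf_unique.toarray(), labels, cv=5)
print("cross-validated accuracy: {:.2f} +/- {:.2f}".format(scores.mean(), scores.std()))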