Word embeddings
In natural language processing (NLP), word embeddings are numerical representations of words. They aim to capture the meanings of words and the relationships between them in a vector space, where similar words sit closer together. This helps models capture the context and meaning of words in text data.
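As a quick illustration (not part of the lab code below, and using made-up numbers), cosine similarity is one common way to measure how close two embedding vectors are:

```python
import numpy as np

def cosine_similarity(a, b):
  # Cosine similarity: values near 1 mean the vectors point in almost the same direction
  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Hypothetical 4-dimensional embeddings, for illustration only
king  = np.array([0.8, 0.1, 0.7, 0.3])
queen = np.array([0.7, 0.2, 0.8, 0.3])
apple = np.array([0.1, 0.9, 0.0, 0.5])

print(cosine_similarity(king, queen))  # relatively high: "similar" words
print(cosine_similarity(king, apple))  # relatively low: unrelated words
```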
Using the IMDB Reviews dataset to produce word embeddings: download the dataset and split it into training and testing sentences along with their labels.
```python
import tensorflow_datasets as tfds
import numpy as np

# Load the IMDB Reviews dataset
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

# Print information about the dataset
print(info)

# Print the contents of the dataset you downloaded
print(imdb)

# Get the train and test sets
train_data, test_data = imdb['train'], imdb['test']

# Initialize sentences and labels lists
training_sentences = []
training_labels = []
testing_sentences = []
testing_labels = []

# Loop over all training examples and save the sentences and labels
for s, l in train_data:
  training_sentences.append(s.numpy().decode('utf8'))
  training_labels.append(l.numpy())

# Loop over all test examples and save the sentences and labels
for s, l in test_data:
  testing_sentences.append(s.numpy().decode('utf8'))
  testing_labels.append(l.numpy())

# Convert labels lists to numpy arrays
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)
```
Generating padded sequences: tokenize the text, then pad the sequences to a uniform length before feeding them into the model.
```python
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Parameters
vocab_size = 10000
max_length = 120
embedding_dim = 16
trunc_type = 'post'
oov_tok = "<OOV>"

# Initialize the Tokenizer class
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

# Generate the word index dictionary for the training sentences
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Generate and pad the training sequences
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

# Generate and pad the test sequences
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)
```
Building and training the model: using Embedding and Dense layers.
```python
import tensorflow as tf

# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Set up the training parameters
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the model summary
model.summary()

num_epochs = 10

# Train the model
model.fit(padded, training_labels_final, epochs=num_epochs,
          validation_data=(testing_padded, testing_labels_final))
```
If you ever get an error like `logits and labels must have the same shape, received ((None, 5) vs (None, 1))`, it means the model's output shape does not match the shape of the labels. Try changing the loss function to `sparse_categorical_crossentropy` (or reduce the final Dense layer to a single sigmoid unit if the labels are binary).
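For context, here is a minimal sketch (not part of the lab) of the multi-class variant that error points to, assuming the same `vocab_size`, `embedding_dim`, and `max_length` as above:

```python
# Sketch only: a 5-class variant where the labels are integer class ids (0-4)
multi_class_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')  # 5 units -> output shape (None, 5)
])

# sparse_categorical_crossentropy accepts integer labels of shape (None,) or (None, 1)
multi_class_model.compile(loss='sparse_categorical_crossentropy',
                          optimizer='adam', metrics=['accuracy'])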
Visualizing embeddings
```python
import io

# Get the embedding layer from the model (i.e. the first layer)
embedding_layer = model.layers[0]

# Get the weights of the embedding layer
embedding_weights = embedding_layer.get_weights()[0]

# Print the shape. Expected is (vocab_size, embedding_dim)
print(embedding_weights.shape)

# Get the index-word dictionary
reverse_word_index = tokenizer.index_word

# Open writeable files
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

# Start counting at `1` because `0` is reserved for the padding token
for word_num in range(1, vocab_size):
  # Get the word associated with the current index
  word_name = reverse_word_index[word_num]

  # Get the embedding weights associated with the current index
  word_embedding = embedding_weights[word_num]

  # Write the word name
  out_m.write(word_name + "\n")

  # Write the word embedding
  out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")

# Close the files
out_v.close()
out_m.close()
```
Download the files:

- `vecs.tsv` - contains the vector weights of each word in the vocabulary
- `meta.tsv` - contains the words in the vocabulary
```python
# Import the file utilities (only available in Colab)
try:
  from google.colab import files
except ImportError:
  pass
else:
  # Download the files
  files.download('vecs.tsv')
  files.download('meta.tsv')
```
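As a quick sanity check (a minimal sketch, not part of the original lab), you can read the files back and confirm that the words and vectors line up before loading them into a tool such as the TensorFlow Embedding Projector:

```python
import numpy as np

# Read the vocabulary words back from meta.tsv
with open('meta.tsv', encoding='utf-8') as f:
  words = f.read().splitlines()

# Read the embedding vectors back from vecs.tsv
vectors = np.loadtxt('vecs.tsv', delimiter='\t')

print(len(words))     # expected: vocab_size - 1 (index 0 is the padding token)
print(vectors.shape)  # expected: (vocab_size - 1, embedding_dim)
```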
Training a binary classifier on the Sarcasm dataset
Data processing: downloading the dataset and splitting it into training and test sets.
```python
import json

# Download the dataset
!wget https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json

# Load the JSON file
with open("./sarcasm.json", 'r') as f:
  datastore = json.load(f)

# Initialize the lists
sentences = []
labels = []

# Collect sentences and labels into the lists
for item in datastore:
  sentences.append(item['headline'])
  labels.append(item['is_sarcastic'])

# Number of examples to use for training
training_size = 20000

# Vocabulary size of the tokenizer
vocab_size = 10000

# Maximum length of the padded sequences
max_length = 32

# Output dimensions of the Embedding layer
embedding_dim = 16

# Split the sentences
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]

# Split the labels
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]
```
Tokenizing and padding the sequences
```python
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Parameters for padding and OOV tokens
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"

# Initialize the Tokenizer class
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

# Generate the word index dictionary
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Generate and pad the training sequences
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length,
                                padding=padding_type, truncating=trunc_type)

# Generate and pad the testing sequences
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length,
                               padding=padding_type, truncating=trunc_type)

# Convert the labels lists into numpy arrays
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)
```
Model definition and training
```python
# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Print the model summary
model.summary()

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

num_epochs = 30

# Train the model
history = model.fit(training_padded, training_labels, epochs=num_epochs,
                    validation_data=(testing_padded, testing_labels), verbose=2)
```
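Once training finishes, you can score unseen headlines by reusing the same tokenizer and padding settings. This is a minimal sketch rather than part of the original lab, and the sample headlines are made up:

```python
# Sketch: score new headlines with the trained sarcasm classifier
new_headlines = [
    "granny starting to fear spiders in the garden might be real",
    "game of thrones season finale showing this sunday night"
]

# Apply the same preprocessing used for training
new_sequences = tokenizer.texts_to_sequences(new_headlines)
new_padded = pad_sequences(new_sequences, maxlen=max_length,
                           padding=padding_type, truncating=trunc_type)

# Each prediction is the probability that the headline is sarcastic
print(model.predict(new_padded))
```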
Visualizing GlobalAveragePooling1D: GlobalAveragePooling1D is a pooling layer for one-dimensional (sequence) data such as text, audio signals, or time series. It averages the input across the sequence (time) dimension, collapsing a (batch, timesteps, features) tensor into a single value per feature, i.e. a (batch, features) output.
```python
import numpy as np
import tensorflow as tf

# Initialize a GlobalAveragePooling1D (GAP1D) layer
gap1d_layer = tf.keras.layers.GlobalAveragePooling1D()

# Define a sample array with shape (1, 3, 2): 1 batch, 3 timesteps, 2 features
sample_array = np.array([[[10,2],[1,3],[1,1]]])

# Print the shape and contents of the sample array
print(f'shape of sample_array = {sample_array.shape}')
print(f'sample array: {sample_array}')

# Pass the sample array to the GAP1D layer
output = gap1d_layer(sample_array)

# Print the shape and contents of the GAP1D output array
# The layer averages over the 3 timesteps, so the result is [[4., 2.]] with shape (1, 2)
print(f'output shape of gap1d_layer: {output.shape}')
print(f'output array of gap1d_layer: {output.numpy()}')
```
Visualizing results
```python
import matplotlib.pyplot as plt

# Plot utility
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

# Plot the accuracy and loss
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")
```
Using a subword text encoder
```python
import tensorflow_datasets as tfds

# Download the plain text default config
imdb_plaintext, info_plaintext = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

# Download the subword encoded pretokenized dataset
imdb_subwords, info_subwords = tfds.load("imdb_reviews/subwords8k", with_info=True, as_supervised=True)

# Get the subword text encoder that ships with this dataset config
# (used below as tokenizer_subwords)
tokenizer_subwords = info_subwords.features['text'].encoder
```
Processing data
```python
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Get the train set
train_data = imdb_plaintext['train']

# Initialize sentences list
training_sentences = []

# Loop over all training examples and save to the list
for s, _ in train_data:
  training_sentences.append(s.numpy().decode('utf8'))

vocab_size = 10000
oov_tok = '<OOV>'

# Initialize the Tokenizer class
tokenizer_plaintext = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

# Generate the word index dictionary for the training sentences
tokenizer_plaintext.fit_on_texts(training_sentences)

# Generate the training sequences
sequences = tokenizer_plaintext.texts_to_sequences(training_sentences)

# Decode the first sequence using the Tokenizer class
print(tokenizer_plaintext.sequences_to_texts(sequences[0:1]))

# Total number of words in the word index dictionary
print(len(tokenizer_plaintext.word_index))

# Print the subwords
print(tokenizer_subwords.subwords)

# Encode the first plaintext sentence using the subword text encoder
tokenized_string = tokenizer_subwords.encode(training_sentences[0])
print(tokenized_string)

# Decode the sequence
original_string = tokenizer_subwords.decode(tokenized_string)

# Print the result
print(original_string)
```
Comparing word and subword encodings
```python
# Define sample sentence
sample_string = 'TensorFlow, from basics to mastery'

# Encode using the plain text tokenizer
tokenized_string = tokenizer_plaintext.texts_to_sequences([sample_string])
print('Tokenized string is {}'.format(tokenized_string))

# Decode and print the result
original_string = tokenizer_plaintext.sequences_to_texts(tokenized_string)
print('The original string: {}'.format(original_string))
```
```python
# Encode using the subword text encoder
tokenized_string = tokenizer_subwords.encode(sample_string)
print ('Tokenized string is {}'.format(tokenized_string))
# Decode and print the results
original_string = tokenizer_subwords.decode(tokenized_string)
print ('The original string: {}'.format(original_string))
# Show token to subword mapping:
for ts in tokenized_string:
  print('{} ----> {}'.format(ts, tokenizer_subwords.decode([ts])))
```
Training the model
```python
import tensorflow as tf

BUFFER_SIZE = 10000
BATCH_SIZE = 64

# Get the train and test splits
train_data, test_data = imdb_subwords['train'], imdb_subwords['test']

# Shuffle the training data
train_dataset = train_data.shuffle(BUFFER_SIZE)

# Batch and pad the datasets to the maximum length of the sequences
train_dataset = train_dataset.padded_batch(BATCH_SIZE)
test_dataset = test_data.padded_batch(BATCH_SIZE)

# Define the dimensionality of the embedding
embedding_dim = 64

# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(tokenizer_subwords.vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Print the model summary
model.summary()

num_epochs = 10

# Set the training parameters
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Start training
history = model.fit(train_dataset, epochs=num_epochs, validation_data=test_dataset)
```
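As with the word-level model, you can classify a raw review by encoding it with the subword tokenizer first. This is a minimal sketch, assuming the model and tokenizer_subwords defined above; the predict_review helper and the sample review are illustrative only:

```python
# Sketch: score a single raw review with the subword-trained model
def predict_review(text):
  # Encode the raw text into subword token ids
  encoded = tokenizer_subwords.encode(text)
  # Add a batch dimension: shape (1, sequence_length)
  batch = tf.expand_dims(encoded, 0)
  # The sigmoid output is the probability that the review is positive
  return model.predict(batch)[0][0]

print(predict_review("A wonderful, heartfelt film with great acting."))
```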
Plotting the accuracy and loss
```python
import matplotlib.pyplot as plt

# Plot utility
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

# Plot the accuracy and loss
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")
```
Thank you for reading :)