Aman Gupta

What are word embeddings?

Word embeddings

  • In natural language processing (NLP), word embeddings are numerical representations of words. They aim to capture the meaning of words and the relationships between them in a vector space, where similar words sit closer together (see the small cosine-similarity sketch below). This helps machines understand the context and meaning of words in text data.
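
  • As a small illustration of "similar words are closer together", the cosine similarity between two embedding vectors can be computed with NumPy. This is only a sketch: the 3-dimensional vectors below are made up, not taken from a trained model.

    import numpy as np

    # Hypothetical embedding vectors (for illustration only, not from a real model)
    king = np.array([0.8, 0.3, 0.1])
    queen = np.array([0.7, 0.4, 0.1])
    apple = np.array([0.1, 0.2, 0.9])

    def cosine_similarity(a, b):
      # Cosine of the angle between two vectors: values near 1 mean similar direction
      return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    print(cosine_similarity(king, queen))  # high: the vectors point in similar directions
    print(cosine_similarity(king, apple))  # lower: the vectors are less alike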

  • Using the IMDB Reviews dataset to learn word embeddings: downloading the dataset and splitting it into training and test sentences along with their labels

    import tensorflow_datasets as tfds
    
    # Load the IMDB Reviews dataset
    imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)
    
    # Print information about the dataset
    print(info)
    
    # Print the contents of the dataset you downloaded
    print(imdb)
    
    import numpy as np
    
    # Get the train and test sets
    train_data, test_data = imdb['train'], imdb['test']
    
    # Initialize sentences and labels lists
    training_sentences = []
    training_labels = []
    
    testing_sentences = []
    testing_labels = []
    
    # Loop over all training examples and save the sentences and labels
    for s,l in train_data:
      training_sentences.append(s.numpy().decode('utf8'))
      training_labels.append(l.numpy())
    
    # Loop over all test examples and save the sentences and labels
    for s,l in test_data:
      testing_sentences.append(s.numpy().decode('utf8'))
      testing_labels.append(l.numpy())
    
    # Convert labels lists to numpy array
    training_labels_final = np.array(training_labels)
    testing_labels_final = np.array(testing_labels)
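
  • A quick sanity check on the loaded data (the IMDB Reviews dataset contains 25,000 training and 25,000 test examples):

    # Check the number of examples and peek at one review and its label
    print(len(training_sentences))    # 25000
    print(len(testing_sentences))     # 25000
    print(training_sentences[0][:100])
    print(training_labels_final[0])   # 0 = negative, 1 = positive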
    
  • Generating padded sequences: tokenizing the text and then padding the sequences to an equal length so they can be fed into the model

    # Parameters
    
    vocab_size = 10000
    max_length = 120
    embedding_dim = 16
    trunc_type='post'
    oov_tok = "<OOV>"
    
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    
    # Initialize the Tokenizer class
    tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
    
    # Generate the word index dictionary for the training sentences
    tokenizer.fit_on_texts(training_sentences)
    word_index = tokenizer.word_index
    
    # Generate and pad the training sequences
    sequences = tokenizer.texts_to_sequences(training_sentences)
    padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)
    
    # Generate and pad the test sequences
    testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
    testing_padded = pad_sequences(testing_sequences,maxlen=max_length, truncating=trunc_type)
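
  • To see what the tokenizer and padding do, here is a small sketch on two made-up sentences (the sentences are illustrative only):

    # Tokenize two short made-up sentences with the fitted tokenizer
    demo_sentences = ['the movie was great', 'the movie was really boring and long']
    demo_sequences = tokenizer.texts_to_sequences(demo_sentences)
    print(demo_sequences)   # lists of word indices; words outside the vocabulary map to the OOV token

    # Pad/truncate both sequences to length 10 for this demo
    demo_padded = pad_sequences(demo_sequences, maxlen=10, truncating=trunc_type)
    print(demo_padded)      # shorter sequences are padded at the front with zeros by default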
    
  • Building and training the model: using embedding and dense layers

    import tensorflow as tf
    
    # Build the model
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(6, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    
    # Setup the training parameters
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    # Print the model summary
    model.summary()
    
    num_epochs = 10
    
    # Train the model
    model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))
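
  • Once training finishes, a quick check of performance on the padded test set can be done with model.evaluate:

    # Evaluate on the padded test sequences and their labels
    loss, accuracy = model.evaluate(testing_padded, testing_labels_final)
    print(f'Test loss: {loss:.4f}, test accuracy: {accuracy:.4f}')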
    


  • If you ever get an error like logits and labels must have the same shape, received ((None, 5) vs (None, 1)), it means the shape of the model's output layer does not match the shape of the labels; try changing the loss function to sparse_categorical_crossentropy (see the sketch below).
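
  • A minimal sketch of that fix, assuming a hypothetical 5-class setup with integer labels (the layer sizes are illustrative):

    # Hypothetical 5-class model: the final Dense layer outputs 5 values per example
    model_multi = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(6, activation='relu'),
        tf.keras.layers.Dense(5, activation='softmax')
    ])

    # sparse_categorical_crossentropy expects integer class labels (0 to 4 here)
    model_multi.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])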

  • Visualising embeddings

    # Get the embedding layer from the model (i.e. first layer)
    embedding_layer = model.layers[0]
    
    # Get the weights of the embedding layer
    embedding_weights = embedding_layer.get_weights()[0]
    
    # Print the shape. Expected is (vocab_size, embedding_dim)
    print(embedding_weights.shape)
    
    # Get the index-word dictionary
    reverse_word_index = tokenizer.index_word
    
    import io
    
    # Open writeable files
    out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
    out_m = io.open('meta.tsv', 'w', encoding='utf-8')
    
    # Initialize the loop. Start counting at `1` because `0` is just for the padding
    for word_num in range(1, vocab_size):
    
      # Get the word associated at the current index
      word_name = reverse_word_index[word_num]
    
      # Get the embedding weights associated with the current index
      word_embedding = embedding_weights[word_num]
    
      # Write the word name
      out_m.write(word_name + "\n")
    
      # Write the word embedding
      out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")
    
    # Close the files
    out_v.close()
    out_m.close()
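
  • A quick peek at the files before downloading (each line of vecs.tsv holds one tab-separated vector, and the matching line of meta.tsv holds the word):

    # Print the first line of each file to confirm the format
    with open('meta.tsv', encoding='utf-8') as f:
      print(f.readline().strip())   # first word in the vocabulary

    with open('vecs.tsv', encoding='utf-8') as f:
      print(f.readline().strip())   # its 16 embedding values, separated by tabs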
    
  • Download the files

    • vecs.tsv - contains the vector weights of each word in the vocabulary
    • meta.tsv - contains the words in the vocabulary

    # Import files utilities in Colab
    try:
      from google.colab import files
    except ImportError:
      pass
    else:
      # Download the files when running in Colab
      files.download('vecs.tsv')
      files.download('meta.tsv')
    
  • Upload vecs.tsv and meta.tsv to https://projector.tensorflow.org to explore the embeddings visually

  • Training a binary classifier on the sarcasm dataset

  • Data processing: downloading the dataset and splitting it into training and test sets

    # Download the dataset
    !wget https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json
    
    import json
    
    # Load the JSON file
    with open("./sarcasm.json", 'r') as f:
        datastore = json.load(f)
    
    # Initialize the lists
    sentences = []
    labels = []
    
    # Collect sentences and labels into the lists
    for item in datastore:
        sentences.append(item['headline'])
        labels.append(item['is_sarcastic'])
    
    # Number of examples to use for training
    training_size = 20000
    
    # Vocabulary size of the tokenizer
    vocab_size = 10000
    
    # Maximum length of the padded sequences
    max_length = 32
    
    # Output dimensions of the Embedding layer
    embedding_dim = 16
    
    # Split the sentences
    training_sentences = sentences[0:training_size]
    testing_sentences = sentences[training_size:]
    
    # Split the labels
    training_labels = labels[0:training_size]
    testing_labels = labels[training_size:]
    
  • Tokenizing

    import numpy as np
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    
    # Parameters for padding and OOV tokens
    trunc_type='post'
    padding_type='post'
    oov_tok = "<OOV>"
    
    # Initialize the Tokenizer class
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    
    # Generate the word index dictionary
    tokenizer.fit_on_texts(training_sentences)
    word_index = tokenizer.word_index
    
    # Generate and pad the training sequences
    training_sequences = tokenizer.texts_to_sequences(training_sentences)
    training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    
    # Generate and pad the testing sequences
    testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
    testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    
    # Convert the labels lists into numpy arrays
    training_labels = np.array(training_labels)
    testing_labels = np.array(testing_labels)
    
  • Model definition

    # Build the model
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(24, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    
    # Print the model summary
    model.summary()
    
    # Compile the model
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    num_epochs = 30
    
    # Train the model
    history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=2)
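
  • With the trained classifier, new headlines can be scored by preparing them exactly like the training data (the headlines below are made up for illustration):

    # Made-up headlines for a quick prediction check
    new_headlines = [
        'scientists discover that water is wet',
        'local man wins award for community service'
    ]

    # Tokenize and pad the same way the training data was prepared
    new_sequences = tokenizer.texts_to_sequences(new_headlines)
    new_padded = pad_sequences(new_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

    # Values close to 1 mean the model considers the headline sarcastic
    print(model.predict(new_padded))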
    
  • Visualizing GlobalAveragePooling1D: GlobalAveragePooling1D is a pooling layer commonly used for one-dimensional (1D) data such as text sequences, audio signals, or time series. It averages the inputs across the time (sequence) dimension, reducing each feature to a single value, so a (batch, timesteps, features) tensor becomes (batch, features). The demo below, followed by a quick NumPy check, shows this on a small array.

    import tensorflow as tf
    
    # Initialize a GlobalAveragePooling1D (GAP1D) layer
    gap1d_layer = tf.keras.layers.GlobalAveragePooling1D()
    
    # Define sample array
    sample_array = np.array([[[10,2],[1,3],[1,1]]])
    
    # Print shape and contents of sample array
    print(f'shape of sample_array = {sample_array.shape}')
    print(f'sample array: {sample_array}')
    
    # Pass the sample array to the GAP1D layer
    output = gap1d_layer(sample_array)
    
    # Print shape and contents of the GAP1D output array
    print(f'output shape of gap1d_layer: {output.shape}')
    print(f'output array of gap1d_layer: {output.numpy()}')
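
  • The same result can be reproduced by averaging the sample array over its time dimension with NumPy, which is what GlobalAveragePooling1D computes:

    # Average over axis 1 (the time steps) to reproduce the GAP1D output
    manual_average = np.mean(sample_array, axis=1)
    print(manual_average)   # [[4. 2.]] for the sample array above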
    
  • Visualizing results

    import matplotlib.pyplot as plt
    
    # Plot utility
    def plot_graphs(history, string):
      plt.plot(history.history[string])
      plt.plot(history.history['val_'+string])
      plt.xlabel("Epochs")
      plt.ylabel(string)
      plt.legend([string, 'val_'+string])
      plt.show()
    
    # Plot the accuracy and loss
    plot_graphs(history, "accuracy")
    plot_graphs(history, "loss")
    
  • Using a sub-word text encoder

    import tensorflow_datasets as tfds
    
    # Download the plain text default config
    imdb_plaintext, info_plaintext = tfds.load("imdb_reviews", with_info=True, as_supervised=True)
    
    # Download the subword encoded pretokenized dataset
    imdb_subwords, info_subwords = tfds.load("imdb_reviews/subwords8k", with_info=True, as_supervised=True)
    
    
  • Processing data

    # Get the train set
    train_data = imdb_plaintext['train']
    
    # Initialize sentences list
    training_sentences = []
    
    # Loop over all training examples and save to the list
    for s,_ in train_data:
      training_sentences.append(s.numpy().decode('utf8'))
    
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    
    vocab_size = 10000
    oov_tok = '<OOV>'
    
    # Initialize the Tokenizer class
    tokenizer_plaintext = Tokenizer(num_words = 10000, oov_token=oov_tok)
    
    # Generate the word index dictionary for the training sentences
    tokenizer_plaintext.fit_on_texts(training_sentences)
    
    # Generate the training sequences
    sequences = tokenizer_plaintext.texts_to_sequences(training_sentences)
    
    # Decode the first sequence using the Tokenizer class
    tokenizer_plaintext.sequences_to_texts(sequences[0:1])
    
    # Total number of words in the word index dictionary
    len(tokenizer_plaintext.word_index)
    
    # Get the subword text encoder that comes with the pretokenized dataset
    tokenizer_subwords = info_subwords.features['text'].encoder

    # Print the subwords
    print(tokenizer_subwords.subwords)
    
    # Encode the first plaintext sentence using the subword text encoder
    tokenized_string = tokenizer_subwords.encode(training_sentences[0])
    print(tokenized_string)
    
    # Decode the sequence
    original_string = tokenizer_subwords.decode(tokenized_string)
    
    # Print the result
    print (original_string)
    
    
  • Comparing word-level and sub-word tokenization

    # Define sample sentence
    sample_string = 'TensorFlow, from basics to mastery'
    
    # Encode using the plain text tokenizer
    tokenized_string = tokenizer_plaintext.texts_to_sequences([sample_string])
    print ('Tokenized string is {}'.format(tokenized_string))
    
    # Decode and print the result
    original_string = tokenizer_plaintext.sequences_to_texts(tokenized_string)
    print ('The original string: {}'.format(original_string))
    
    # Encode using the subword text encoder
    tokenized_string = tokenizer_subwords.encode(sample_string)
    print ('Tokenized string is {}'.format(tokenized_string))

    # Decode and print the results
    original_string = tokenizer_subwords.decode(tokenized_string)
    print ('The original string: {}'.format(original_string))

    # Show token to subword mapping
    for ts in tokenized_string:
      print ('{} ----> {}'.format(ts, tokenizer_subwords.decode([ts])))


  • Training the model

    BUFFER_SIZE = 10000
    BATCH_SIZE = 64
    
    # Get the train and test splits
    train_data, test_data = imdb_subwords['train'], imdb_subwords['test']
    
    # Shuffle the training data
    train_dataset = train_data.shuffle(BUFFER_SIZE)
    
    # Batch the datasets, padding each batch to the length of its longest sequence
    train_dataset = train_dataset.padded_batch(BATCH_SIZE)
    test_dataset = test_data.padded_batch(BATCH_SIZE)
    
    import tensorflow as tf
    
    # Define dimensionality of the embedding
    embedding_dim = 64
    
    # Build the model
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(tokenizer_subwords.vocab_size, embedding_dim),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(6, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    
    # Print the model summary
    model.summary()
    
    num_epochs = 10
    
    # Set the training parameters
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    # Start training
    history = model.fit(train_dataset, epochs=num_epochs, validation_data=test_dataset)
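
  • Evaluating the sub-word model on the test batches after training:

    # Evaluate the trained model on the padded test batches
    loss, accuracy = model.evaluate(test_dataset)
    print(f'Test loss: {loss:.4f}, test accuracy: {accuracy:.4f}')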
    
  • Plotting the accuracy and loss

    import matplotlib.pyplot as plt
    
    # Plot utility
    def plot_graphs(history, string):
      plt.plot(history.history[string])
      plt.plot(history.history['val_'+string])
      plt.xlabel("Epochs")
      plt.ylabel(string)
      plt.legend([string, 'val_'+string])
      plt.show()
    
    # Plot the accuracy and loss
    plot_graphs(history, "accuracy")
    plot_graphs(history, "loss")
    


Thank you for reading :)
