In this lab, you will be building a sentiment classification model to distinguish between positive and negative movie reviews. You will train it on the IMDB Reviews dataset and visualize the word embeddings generated after training.

In [1]:
import tensorflow_datasets as tfds
import tensorflow as tf
import io

You will load the dataset via TensorFlow Datasets, a collection of prepared datasets for machine learning. If you're running this notebook on your local machine, make sure the tensorflow-datasets package is installed before importing it. You can install it via pip: pip install -q tensorflow-datasets

The tfds.load() method downloads the dataset into your working directory. You can set the with_info parameter to True if you want to see the description of the dataset. The as_supervised parameter, on the other hand, is set to True to load the data as (input, label) pairs.

To ensure smooth operation, the data was pre-downloaded and saved in the data folder. When the data is already downloaded, you can read it by passing two additional arguments: with data_dir="./data/" you specify the folder where the data is located (if different from the default), and by setting download=False you explicitly tell the method to read the data from that folder rather than downloading it.
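For reference, here is a sketch of that call once the data is already in the folder (it mirrors the cell below, only with download switched off):

# Read the pre-downloaded copy instead of downloading it again
# imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True, data_dir="./data/", download=False)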

In [3]:
# Load the IMDB Reviews dataset
# Set download=True on the first run so the data is downloaded into the directory above.
# On later runs, the data can be read directly from that folder by setting download=False.

imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True, data_dir="./data/", download=True)
Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to data\imdb_reviews\plain_text\1.0.0...
Dataset imdb_reviews downloaded and prepared to data\imdb_reviews\plain_text\1.0.0. Subsequent calls will reuse this data.
In [5]:
print(info)
tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset. This is a dataset for binary sentiment
    classification containing substantially more data than previous benchmark
    datasets. We provide a set of 25,000 highly polar movie reviews for training,
    and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_dir='data\\imdb_reviews\\plain_text\\1.0.0',
    file_format=tfrecord,
    download_size=Unknown size,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=int64, num_classes=2),
        'text': Text(shape=(), dtype=string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        'train': <SplitInfo num_examples=25000, num_shards=1>,
        'unsupervised': <SplitInfo num_examples=50000, num_shards=1>,
    },
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning Word Vectors for Sentiment Analysis},
      booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
      month     = {June},
      year      = {2011},
      address   = {Portland, Oregon, USA},
      publisher = {Association for Computational Linguistics},
      pages     = {142--150},
      url       = {http://www.aclweb.org/anthology/P11-1015}
    }""",
)

As you can see in the output above, there are a total of 100,000 examples in the dataset, split into train, test, and unsupervised sets. For this lab, you will only use the train and test sets because you will need labeled examples to train your model.

The imdb dataset that you downloaded earlier is a dictionary that maps each split name to a tf.data.Dataset object.

You can preview the raw format of a few examples by using the take() method and iterating over it as shown below:

In [8]:
for example in imdb['train'].take(2):
    print(example)
(<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was constant. Constantly slow and boring. Things seemed to happen, but with no explanation of what was causing them or why. I admit, I may have missed part of the film, but i watched the majority of it and everything just seemed to happen of its own accord without any real concern for anything else. I cant recommend this film at all.'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)

You can see that each example is a 2-element tuple of tensors containing the text first, then the label (shown in the numpy() property). The next code cell assigns the train and test splits to separate variables so you can preprocess the text and feed it to the model later.
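Before that, because each element is a (text, label) pair, you can also unpack it directly in the loop. A minimal sketch (not part of the original lab) that prints the first 50 bytes of one review and its label:

# Unpack the (text, label) tuple and use .numpy() to get plain values
for review, label in imdb['train'].take(1):
    print(review.numpy()[:50], label.numpy())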

In [9]:
# Get the train and test sets
train_dataset, test_dataset = imdb['train'], imdb['test']

Now you can do the text preprocessing steps you've learned: convert the strings to integer sequences, then pad them to a uniform length. The parameters are separated into their own code cell below so they will be easy to tweak later if you want.

In [10]:
# Parameters

VOCAB_SIZE = 10000
MAX_LENGTH = 120
EMBEDDING_DIM = 16
PADDING_TYPE = 'pre'
TRUNC_TYPE = 'post'

An important thing to note here is you should generate the vocabulary based only on the training set. You should not include the test set because that is meant to represent data that the model hasn't seen before. With that, you can expect more unknown tokens (i.e. the value 1) in the integer sequences of the test data. Also for clarity in demonstrating the transformations, you will first separate the reviews and labels. You will see other ways to implement the data pipeline in the next labs.

In [11]:
# Instantiate vectorization layer
vectorize_layer = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)

# Get the string inputs and integer outputs of the training set
train_reviews = train_dataset.map(lambda review, label:review)
train_label = train_dataset.map(lambda review, label:label)

# Get the string inputs and integer outputs of the test set
test_reviews = test_dataset.map(lambda review,label:review)
test_label = test_dataset.map(lambda review,label:label)

# Generate the vocabulary based only on the training set
vectorize_layer.adapt(train_reviews)
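As a quick check of the point above about unknown tokens, you can pass the adapted layer a sentence containing a made-up word; anything outside the learned vocabulary maps to the out-of-vocabulary index 1 (the string below is just an illustration, not from the dataset):

# The made-up last word is not in the vocabulary, so it maps to the OOV index 1
print(vectorize_layer(["this movie was zzzmadeupwordzzz"]))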

You will define a padding function to generate the padded sequences. Note that the pad_sequences() function expects an iterable (e.g. list) while the input to this function is a tf.data.Dataset. Here's one way to do the conversion:

  • Put all the elements in a single batch. The function below uses padded_batch(), which pads every element in the batch to the length of the longest one.
    • You will need to specify the batch size, and it has to match the number of elements in the dataset. From the output of the dataset info earlier, you know that this should be 25000.
    • Instead of hardcoding that number, you can also use the cardinality() method. This computes the number of elements in a tf.data.Dataset.
  • Use the get_single_element() method on the single batch to output a Tensor.
  • Convert back to a tf.data.Dataset. You'll see why this is needed in the next cell.
In [14]:
def padding_func(sequences):
    # Put all the elements in a single padded batch
    sequences = sequences.padded_batch(batch_size=tf.data.experimental.cardinality(sequences).numpy(),
                                       padded_shapes=[None])
    
    # Output a tensor from a single batch
    sequences = sequences.get_single_element()
    
    # Pad the sequences
    padded_sequences = tf.keras.utils.pad_sequences(sequences.numpy(),
                                                    maxlen=MAX_LENGTH,
                                                    truncating=TRUNC_TYPE,
                                                    padding=PADDING_TYPE)
    
    # Convert back to tf.data.Dataset
    padded_sequences=tf.data.Dataset.from_tensor_slices(padded_sequences)
    
    return padded_sequences

A note on the batching method: ragged_batch() would keep the variable-length sequences as a tf.RaggedTensor, while padded_batch() pads them to a uniform length and outputs a dense tensor that can be passed straight to .numpy() and pad_sequences(). Either works for collecting the dataset into a single batch; this notebook uses padded_batch().
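You can see the difference on a toy dataset. This standalone sketch assumes a TensorFlow version where Dataset.ragged_batch() is available:

# Toy dataset with elements of length 1, 2, and 3
toy = tf.data.Dataset.range(1, 4, output_type=tf.int32).map(lambda x: tf.fill([x], x))

# ragged_batch() keeps the variable lengths: <tf.RaggedTensor [[1], [2, 2], [3, 3, 3]]>
print(next(iter(toy.ragged_batch(3))))

# padded_batch() pads every element to the longest one: [[1 0 0] [2 2 0] [3 3 3]]
print(next(iter(toy.padded_batch(3, padded_shapes=[None]))))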

This is the pipeline to convert the raw string inputs to padded integer sequences:

  • Use the map() method to pass each string to the TextVectorization layer defined earlier.
  • Use the apply() method to use the padding function on the entire dataset.

The difference between map() and apply() is that the mapping function in map() expects its input to be single elements (i.e. element-wise transformations), while the transformation function in apply() expects its input to be the entire dataset in the pipeline.
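A toy illustration of that difference (a sketch, not part of the lab):

ds = tf.data.Dataset.range(5)

# map(): the function receives one element at a time
doubled = ds.map(lambda x: x * 2)
print(list(doubled.as_numpy_iterator()))      # [0, 2, 4, 6, 8]

# apply(): the function receives the whole dataset and returns a new dataset
first_three = ds.apply(lambda dataset: dataset.take(3))
print(list(first_three.as_numpy_iterator()))  # [0, 1, 2]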

In [15]:
# Apply the layer to the train and test data
train_sequences = train_reviews.map(lambda text: vectorize_layer(text)).apply(padding_func)
test_sequences = test_reviews.map(lambda text: vectorize_layer(text)).apply(padding_func)
In [16]:
# View 2 training sequences
for example in train_sequences.take(2):
  print(example)
  print()
tf.Tensor(
[  11   14   34  412  384   18   90   28    1    8   33 1320 3555   42
  487    1  191   24   85  152   19   11  217  317   28   65  240  215
    8  489   54   65   85  112   96   22 5652   11   93  639  741   11
   18    7   34  394 9515  170 2464  408    2   88 1216  137   66  144
   51    2    1 7552   66  245   65 2867   16    1 2858    1    1 1428
 5045    3   40    1 1581   17 3555   14  158   19    4 1216  890 8030
    8    4   18   12   14 4054    5   99  146 1240   10  237  707   12
   48   24   93   39   11 7329  152   39 1320    1   50  398   10   96
 1155  850  141    9    0    0    0    0], shape=(120,), dtype=int32)

tf.Tensor(
[  10   26   75  617    6  777 2355  299   95   19   11    7  603  662
    6    4 2128    5  180  571   63 1404  107 2408    3 3902   21    2
    1    3  253   41 4777    4  169  186   21   11 4254   10 1503 2355
   80    2   20   14 1971    2  114  942   14 1737 1297  593    3  356
  180  445    6  597   19   17   57 1772    5   49   14 3997   98   42
  134   10  933   10  194   26 1027  171    5    2   20   19   10  284
    2 2065    5    9    3  279   41  445    6  597    5   30  200    1
  201   99  146 4522   16  229  329   10  175  369   11   20   31   32
    0    0    0    0    0    0    0    0], shape=(120,), dtype=int32)

In [24]:
# Recombine sequences with labels -
# Zipping requires both to be in tf.data.Dataset format

train_dataset_vectorized = tf.data.Dataset.zip((train_sequences, train_label))
test_dataset_vectorized = tf.data.Dataset.zip((test_sequences, test_label))
In [25]:
# View 2 training sequences and their labels
for example in train_dataset_vectorized.take(2):
  print(example)
  print()
(<tf.Tensor: shape=(120,), dtype=int32, numpy=
array([  11,   14,   34,  412,  384,   18,   90,   28,    1,    8,   33,
       1320, 3555,   42,  487,    1,  191,   24,   85,  152,   19,   11,
        217,  317,   28,   65,  240,  215,    8,  489,   54,   65,   85,
        112,   96,   22, 5652,   11,   93,  639,  741,   11,   18,    7,
         34,  394, 9515,  170, 2464,  408,    2,   88, 1216,  137,   66,
        144,   51,    2,    1, 7552,   66,  245,   65, 2867,   16,    1,
       2858,    1,    1, 1428, 5045,    3,   40,    1, 1581,   17, 3555,
         14,  158,   19,    4, 1216,  890, 8030,    8,    4,   18,   12,
         14, 4054,    5,   99,  146, 1240,   10,  237,  707,   12,   48,
         24,   93,   39,   11, 7329,  152,   39, 1320,    1,   50,  398,
         10,   96, 1155,  850,  141,    9,    0,    0,    0,    0])>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)

(<tf.Tensor: shape=(120,), dtype=int32, numpy=
array([  10,   26,   75,  617,    6,  777, 2355,  299,   95,   19,   11,
          7,  603,  662,    6,    4, 2128,    5,  180,  571,   63, 1404,
        107, 2408,    3, 3902,   21,    2,    1,    3,  253,   41, 4777,
          4,  169,  186,   21,   11, 4254,   10, 1503, 2355,   80,    2,
         20,   14, 1971,    2,  114,  942,   14, 1737, 1297,  593,    3,
        356,  180,  445,    6,  597,   19,   17,   57, 1772,    5,   49,
         14, 3997,   98,   42,  134,   10,  933,   10,  194,   26, 1027,
        171,    5,    2,   20,   19,   10,  284,    2, 2065,    5,    9,
          3,  279,   41,  445,    6,  597,    5,   30,  200,    1,  201,
         99,  146, 4522,   16,  229,  329,   10,  175,  369,   11,   20,
         31,   32,    0,    0,    0,    0,    0,    0,    0,    0])>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)

In [27]:
# Lastly, you will optimize and batch the dataset

SHUFFLE_BATCH_SIZE = 1000               # buffer size used by shuffle()
PREFETCH_BATCH_SIZE = tf.data.AUTOTUNE  # let tf.data tune the prefetch buffer size
BATCH_SIZE = 32                         # number of examples per batch

train_dataset_final = (train_dataset_vectorized.cache()
                      .shuffle(SHUFFLE_BATCH_SIZE)
                      .prefetch(PREFETCH_BATCH_SIZE)
                      .batch(BATCH_SIZE))

test_dataset_final = (test_dataset_vectorized.cache()
                     .prefetch(PREFETCH_BATCH_SIZE)
                     .batch(BATCH_SIZE))

With the data already preprocessed, you can proceed to building your sentiment classification model. The input will be an Embedding layer. The main idea here is to represent each word in your vocabulary with a vector. These vectors have trainable weights, so as your neural network learns, words that are most likely to appear in a positive review will converge towards similar weights. Similarly, words in negative reviews will be clustered more closely together. You can read more about word embeddings in the TensorFlow Word Embeddings guide.
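To make the idea concrete, an Embedding layer is just a trainable lookup table from integer token indices to vectors. A minimal standalone sketch (the sizes here are made up for illustration and are unrelated to the model below):

# A lookup table with 10 possible indices, each mapped to a trainable 4-dimensional vector
demo_embedding = tf.keras.layers.Embedding(input_dim=10, output_dim=4)

# A batch of one sequence with 3 token indices -> output shape (batch, sequence, embedding_dim) = (1, 3, 4)
print(demo_embedding(tf.constant([[1, 2, 3]])).shape)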

After the Embedding layer, you will flatten its output and feed it into a Dense layer. You will explore other architectures for these hidden layers in the next labs.

The output layer is a single neuron with a sigmoid activation to distinguish between the 2 classes. As is typical with binary classifiers, you will use binary_crossentropy as your loss function while training.

In [28]:
# Build the model
model = tf.keras.Sequential([
    tf.keras.Input(shape=(MAX_LENGTH,)),
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Setup the training parameters
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

# Print the model summary
model.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding (Embedding)       (None, 120, 16)           160000    
                                                                 
 flatten (Flatten)           (None, 1920)              0         
                                                                 
 dense (Dense)               (None, 6)                 11526     
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
=================================================================
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________
In [29]:
NUM_EPOCHS = 5

# Train the model
model.fit(train_dataset_final, epochs=NUM_EPOCHS, validation_data=test_dataset_final)
Epoch 1/5
782/782 [==============================] - 15s 7ms/step - loss: 0.5048 - accuracy: 0.7288 - val_loss: 0.3851 - val_accuracy: 0.8261
Epoch 2/5
782/782 [==============================] - 5s 6ms/step - loss: 0.2394 - accuracy: 0.9054 - val_loss: 0.4269 - val_accuracy: 0.8194
Epoch 3/5
782/782 [==============================] - 5s 6ms/step - loss: 0.0943 - accuracy: 0.9745 - val_loss: 0.5099 - val_accuracy: 0.8128
Epoch 4/5
782/782 [==============================] - 4s 6ms/step - loss: 0.0234 - accuracy: 0.9965 - val_loss: 0.5992 - val_accuracy: 0.8093
Epoch 5/5
782/782 [==============================] - 4s 5ms/step - loss: 0.0072 - accuracy: 0.9992 - val_loss: 0.6804 - val_accuracy: 0.8055
Out[29]:
<keras.callbacks.History at 0x1940de217b0>

After training, you can visualize the trained weights in the Embedding layer to see words that are clustered together. The TensorFlow Embedding Projector can reduce the 16-dimensional vectors you defined earlier into fewer components so they can be plotted in the projector. First, you will need to get these weights, and you can do that with the cell below:

In [30]:
# Get the embedding layer from the model (i.e. first layer)
embedding_layer = model.layers[0]

# Get the weights of the embedding layer
embedding_weights = embedding_layer.get_weights()[0]

# Print the shape. Expected is (vocab_size, embedding_dim)
print(embedding_weights.shape)
(10000, 16)

You will need to generate two files:

  • vecs.tsv - contains the vector weights of each word in the vocabulary
  • meta.tsv - contains the words in the vocabulary

You will get the word list from the TextVectorization layer you adapted earlier, then start the loop to generate the files. You will loop vocab_size-1 times, skipping index 0 because it is reserved for padding.

In [31]:
# Open writeable files
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

# Get the word list
vocabulary = vectorize_layer.get_vocabulary()

# Initialize the loop. Start counting at `1` because `0` is just for the padding
for word_num in range(1, len(vocabulary)):

  # Get the word associated with the current index
  word_name = vocabulary[word_num]

  # Get the embedding weights associated with the current index
  word_embedding = embedding_weights[word_num]

  # Write the word name
  out_m.write(word_name + "\n")

  # Write the word embedding
  out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")

# Close the files
out_v.close()
out_m.close()
In [32]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
In [33]:
# Reduce dimensions to 2D for visualization
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
embedding_2d_tsne = tsne.fit_transform(embedding_weights) 
In [ ]:
pca = PCA(n_components=2)
embedding_2d_pca = pca.fit_transform(embedding_weights)
In [38]:
plt.figure(figsize=(12, 8))

# Plot the t-SNE projection computed above (swap in embedding_2d_pca to compare with PCA)
sns.scatterplot(x=embedding_2d_tsne[:, 0], y=embedding_2d_tsne[:, 1], alpha=0.6)

# Annotate some words (subset for clarity)
num_words_to_label = 200  # Adjust based on readability
for i in range(num_words_to_label):
    plt.text(embedding_2d_tsne[i, 0], embedding_2d_tsne[i, 1], vocabulary[i], fontsize=9)

plt.title("Word Embedding Visualization")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.show()
[Output: scatter plot titled "Word Embedding Visualization"]
In [37]:
positive_words = {"good", "great", "excellent", "love", "amazing"}
negative_words = {"bad", "terrible", "worst", "hate", "awful"}

plt.figure(figsize=(12, 8))

for i, word in enumerate(vocabulary[:300]):  # Plot only first 300 words for readability
    color = "blue" if word in positive_words else "red" if word in negative_words else "gray"
    plt.scatter(embedding_2d_tsne[i, 0], embedding_2d_tsne[i, 1], color=color, alpha=0.6)
    if i < 100:  # Annotate only a few for clarity
        plt.text(embedding_2d_tsne[i, 0], embedding_2d_tsne[i, 1], word, fontsize=9)

plt.title("Word Embedding Clusters (Sentiment-based)")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.show()
[Output: scatter plot titled "Word Embedding Clusters (Sentiment-based)"]