import json
import io
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

# Loading the sarcasm dataset

# The file is parsed one line at a time, each line holding one JSON record.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale, and blank lines are skipped instead of crashing json.loads.
with open('Sarcasm_Headlines_Dataset.json', 'r', encoding='utf-8') as f:
    datastore = [json.loads(line) for line in f if line.strip()]

# Collect the headline text and its sarcasm label as two parallel lists.
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]

# Hyperparameters shared by the data preparation and model definition below
TRAINING_SIZE = 20_000  # number of examples to use for training
VOCAB_SIZE = 10_000     # vocabulary size of the tokenizer
MAX_LENGTH = 32         # maximum length of the padded sequences
EMBEDDING_DIM = 16      # output dimensions of the Embedding layer

# The first TRAINING_SIZE examples are used for training; the remainder is
# held out for testing. Sentences and labels are sliced at the same index so
# they stay aligned.
train_sentences, test_sentences = sentences[:TRAINING_SIZE], sentences[TRAINING_SIZE:]
train_labels, test_labels = labels[:TRAINING_SIZE], labels[TRAINING_SIZE:]

# Text vectorization layer capped at VOCAB_SIZE tokens, producing sequences
# of length MAX_LENGTH
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_sequence_length=MAX_LENGTH,
)

# Build the vocabulary from the training inputs only
vectorize_layer.adapt(train_sentences)

# Vectorize both splits with the adapted layer
train_sequences, test_sequences = (
    vectorize_layer(train_sentences),
    vectorize_layer(test_sentences),
)

# Inspect the shape of the vectorized training inputs
train_sequences.shape

TensorShape([20000, 32])

# Combine the vectorized inputs and their labels into `tf.data.Dataset` objects to prepare them for training.
# Each dataset element is an (input sequence, label) pair.

# Slice the (sequences, labels) tuples into per-example datasets
train_dataset_vectorized = tf.data.Dataset.from_tensor_slices(
    (train_sequences, train_labels)
)
test_dataset_vectorized = tf.data.Dataset.from_tensor_slices(
    (test_sequences, test_labels)
)

# Preview the first two training examples, each followed by a blank line
for example in train_dataset_vectorized.take(2):
    print(example, end='\n\n')

(<tf.Tensor: shape=(32,), dtype=int64, numpy=
array([ 319,    1,  943, 4079, 2366,   47,  366,   94, 2026,    6, 2653,
       9469,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
      dtype=int64)>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)

(<tf.Tensor: shape=(32,), dtype=int64, numpy=
array([   4, 7185, 3128, 3305,   28,    2,  152,    1,  358, 2902,    6,
        236,    9,  843,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
      dtype=int64)>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)

SHUFFLE_BUFFER_SIZE = 1000
PREFETCH_BUFFER_SIZE = tf.data.AUTOTUNE
BATCH_SIZE = 32

# Optimize the datasets for training.
# Order matters: cache the vectorized examples, shuffle individual examples
# (training set only), group them into batches, and prefetch last so that
# complete batches -- not single examples -- are prepared ahead of the
# training step that consumes them.
train_dataset_final = (train_dataset_vectorized
                       .cache()
                       .shuffle(SHUFFLE_BUFFER_SIZE)
                       .batch(BATCH_SIZE)
                       .prefetch(PREFETCH_BUFFER_SIZE)
                       )

# The test set is not shuffled; otherwise it uses the same pipeline.
test_dataset_final = (test_dataset_vectorized
                      .cache()
                      .batch(BATCH_SIZE)
                      .prefetch(PREFETCH_BUFFER_SIZE)
                      )

# Sample input used to demonstrate GlobalAveragePooling1D (GAP1D)
sample_array = np.array([[[10, 2], [1, 3], [1, 1]]])

# Standalone GAP1D layer for the demonstration
gap1d_layer = tf.keras.layers.GlobalAveragePooling1D()

# Show the shape and contents of the sample input
print(
    f'shape of sample_array = {sample_array.shape}',
    f'sample array: {sample_array}',
    sep='\n',
)

# Run the sample input through the GAP1D layer
output = gap1d_layer(sample_array)

# Show the shape and contents of the GAP1D output
print(
    f'output shape of gap1d_layer: {output.shape}',
    f'output array of gap1d_layer: {output.numpy()}',
    sep='\n',
)

shape of sample_array = (1, 3, 2)
sample array: [[[10  2]
  [ 1  3]
  [ 1  1]]]
output shape of gap1d_layer: (1, 2)
output array of gap1d_layer: [[4 2]]

# Layer stack: token ids -> embeddings -> average over the sequence axis ->
# small dense head with a single sigmoid output
model_layers = [
    tf.keras.Input(shape=(MAX_LENGTH,)),
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]

# Assemble the sequential model from the layer stack
model = tf.keras.Sequential(model_layers)

# Display the layer-by-layer summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding (Embedding)       (None, 32, 16)            160000    
                                                                 
 global_average_pooling1d_1   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense (Dense)               (None, 24)                408       
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
=================================================================
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________

# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Number of passes over the training dataset
num_epochs = 10

# Fit on the training dataset, evaluating on the test dataset after each epoch
history = model.fit(
    train_dataset_final,
    validation_data=test_dataset_final,
    epochs=num_epochs,
    verbose=2,
)

Epoch 1/10
625/625 - 6s - loss: 0.5753 - accuracy: 0.6921 - val_loss: 0.4137 - val_accuracy: 0.8193 - 6s/epoch - 10ms/step
Epoch 2/10
625/625 - 3s - loss: 0.3198 - accuracy: 0.8719 - val_loss: 0.3609 - val_accuracy: 0.8371 - 3s/epoch - 4ms/step
Epoch 3/10
625/625 - 3s - loss: 0.2424 - accuracy: 0.9046 - val_loss: 0.3504 - val_accuracy: 0.8498 - 3s/epoch - 4ms/step
Epoch 4/10
625/625 - 3s - loss: 0.1966 - accuracy: 0.9263 - val_loss: 0.3683 - val_accuracy: 0.8512 - 3s/epoch - 4ms/step
Epoch 5/10
625/625 - 3s - loss: 0.1647 - accuracy: 0.9395 - val_loss: 0.3934 - val_accuracy: 0.8498 - 3s/epoch - 4ms/step
Epoch 6/10
625/625 - 3s - loss: 0.1401 - accuracy: 0.9514 - val_loss: 0.4240 - val_accuracy: 0.8465 - 3s/epoch - 4ms/step
Epoch 7/10
625/625 - 2s - loss: 0.1216 - accuracy: 0.9596 - val_loss: 0.4596 - val_accuracy: 0.8435 - 2s/epoch - 4ms/step
Epoch 8/10
625/625 - 3s - loss: 0.1079 - accuracy: 0.9650 - val_loss: 0.5169 - val_accuracy: 0.8329 - 3s/epoch - 4ms/step
Epoch 9/10
625/625 - 3s - loss: 0.0945 - accuracy: 0.9699 - val_loss: 0.5362 - val_accuracy: 0.8320 - 3s/epoch - 4ms/step
Epoch 10/10
625/625 - 3s - loss: 0.0821 - accuracy: 0.9747 - val_loss: 0.5922 - val_accuracy: 0.8278 - 3s/epoch - 4ms/step

# Plot utility
def plot_graphs(history, string):
    """Plot a training metric together with its validation counterpart.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            their recorded values (here, the return value of ``model.fit``).
        string: Name of the metric to plot. The series stored under
            ``'val_' + string`` is drawn on the same axes.
    """
    validation_key = 'val_' + string

    # Draw the training series first, then the validation series
    for series_key in (string, validation_key):
        plt.plot(history.history[series_key])

    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, validation_key])
    plt.show()

# Plot the accuracy and loss curves, one figure per metric
for metric_name in ("accuracy", "loss"):
    plot_graphs(history, metric_name)