Ungraded Lab: Tokenizing the Sarcasm Dataset¶
In this lab, you will apply what you've learned in the past two exercises to preprocess the News Headlines Dataset for Sarcasm Detection. The dataset contains news headlines, each labeled as sarcastic or not. You will revisit this dataset in later labs, so it is good to get acquainted with it now.
Imports¶
Let's start by importing the packages and methods you will use in this lab.
import tensorflow as tf
import json
import tensorflow_datasets as tfds
from tensorflow.keras.utils import pad_sequences
Download and inspect the dataset¶
Then, you will fetch the dataset and preview some of its elements.
# Download the dataset
!wget -nc https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json
File ‘sarcasm.json’ already there; not retrieving.
# Load the JSON file
with open("./sarcasm.json", 'r') as f:
datastore = json.load(f)
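If you want a quick sanity check on what was just loaded, an optional sketch like the one below (reusing the datastore variable from the cell above) prints the container type and the number of records:
# Optional sanity check: the JSON file loads into a Python list of dictionaries
print(type(datastore))
print(f'number of records: {len(datastore)}')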
You can inspect a few of the elements in the list. You will notice that each element consists of a dictionary with a URL link, the actual headline, and a label named is_sarcastic. Printed below are two elements with contrasting labels.
# Non-sarcastic headline
print(datastore[0])
# Sarcastic headline
print(datastore[20000])
{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}
{'article_link': 'https://www.theonion.com/pediatricians-announce-2011-newborns-are-ugliest-babies-1819572977', 'headline': 'pediatricians announce 2011 newborns are ugliest babies in 30 years', 'is_sarcastic': 1}
With that, you can collect the headlines because those are the string inputs that you will preprocess into numeric features.
# Collect the headline elements into a list
sentences = [item['headline'] for item in datastore]
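Only the headlines are needed for this lab, but you will also need the labels when you train models in later labs. As an optional sketch (the labels variable name here is just for illustration), you can collect them the same way:
# Collect the labels into a list (0 = not sarcastic, 1 = sarcastic)
labels = [item['is_sarcastic'] for item in datastore]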
Preprocessing the headlines¶
You can convert the sentences list above into padded sequences by using the same methods you've been using in the previous labs. The cells below will build the vocabulary, then use that to generate the list of post-padded sequences for each of the 26,709 headlines.
# Instantiate the layer
vectorize_layer = tf.keras.layers.TextVectorization()
# Build the vocabulary
vectorize_layer.adapt(sentences)
# Apply the layer for post padding
post_padded_sequences = vectorize_layer(sentences)
You can view the results for a particular headline by changing the value of index below.
# Print a sample headline and sequence
index = 2
print(f'sample headline: {sentences[index]}')
print(f'padded sequence: {post_padded_sequences[index]}')
print()
# Print dimensions of padded sequences
print(f'shape of padded sequences: {post_padded_sequences.shape}')
sample headline: mom starting to fear son's web series closest thing she will have to grandchild
padded sequence: [ 140 825 2 813 1100 2048 571 5057 199 139 39 46 2 13050 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

shape of padded sequences: (26709, 39)
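If you are curious which word each integer stands for, you can inspect the vocabulary the layer learned during adapt(). The optional sketch below uses the layer's get_vocabulary() and vocabulary_size() methods; by default, index 0 is reserved for padding and index 1 for out-of-vocabulary tokens:
# Inspect the learned vocabulary (index 0 is padding, index 1 is the OOV token)
vocabulary = vectorize_layer.get_vocabulary()
print(f'vocabulary size: {vectorize_layer.vocabulary_size()}')
print(f'first 10 tokens: {vocabulary[:10]}')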
For pre-padding, you have to set up the TextVectorization layer differently. You don't want the automatic post-padding shown above; instead, you want sequences with variable length, which you will then pass to the pad_sequences() utility function you used in the previous lab. The cells below show one way to do it:
First, you will initialize the TextVectorization layer and set its ragged flag to True. This will result in a ragged tensor, which simply means a tensor with variable-length elements. The sequences will indeed have different lengths after removing the zeroes, thus you will need the ragged tensor to contain them.
Like before, you will use the layer's adapt() method to generate a vocabulary.
Then, you will apply the layer to the string sentences to generate the integer sequences. As mentioned, this will not be post-padded.
Lastly, you will pass this ragged tensor to the pad_sequences() function to generate pre-padded sequences.
# Instantiate the layer and set the `ragged` flag to `True`
vectorize_layer = tf.keras.layers.TextVectorization(ragged=True)
# Build the vocabulary
vectorize_layer.adapt(sentences)
# Apply the layer to generate a ragged tensor
ragged_sequences = vectorize_layer(sentences)
# Print a sample headline and sequence
index = 2
print(f'sample headline: {sentences[index]}')
print(f'ragged sequence: {ragged_sequences[index]}')
print()
# Print dimensions of the ragged sequences
print(f'shape of ragged sequences: {ragged_sequences.shape}')
sample headline: mom starting to fear son's web series closest thing she will have to grandchild
ragged sequence: [ 140 825 2 813 1100 2048 571 5057 199 139 39 46 2 13050]

shape of ragged sequences: (26709, None)
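To confirm that the ragged tensor really holds sequences of different lengths, an optional sketch like the one below prints the lengths of the first few rows using the tensor's row_lengths() method:
# Show that the rows of the ragged tensor have different lengths
print(f'lengths of the first 5 sequences: {ragged_sequences.row_lengths()[:5]}')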
# Apply pre-padding to the ragged tensor
pre_padded_sequences = pad_sequences(ragged_sequences.numpy())
# Preview the result for the sequence at index 2
pre_padded_sequences[2]
array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 140, 825, 2, 813, 1100, 2048, 571, 5057, 199, 139, 39, 46, 2, 13050], dtype=int32)
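The pad_sequences() utility also accepts optional arguments if you need more control over the output. As a rough sketch (the maxlen of 20 below is an arbitrary choice, not part of this lab), you can cap the sequence length and choose where padding and truncation happen:
# Pre-pad, but cap each sequence at 20 tokens and truncate longer ones at the end
capped_sequences = pad_sequences(ragged_sequences.numpy(), maxlen=20, padding='pre', truncating='post')
print(capped_sequences.shape)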
You can see the results for post-padded and pre-padded sequences by changing the value of index below.
# Print a sample headline and sequence
index = 2
print(f'sample headline: {sentences[index]}')
print()
print(f'post-padded sequence: {post_padded_sequences[index]}')
print()
print(f'pre-padded sequence: {pre_padded_sequences[index]}')
print()
# Print dimensions of padded sequences
print(f'shape of post-padded sequences: {post_padded_sequences.shape}')
print(f'shape of pre-padded sequences: {pre_padded_sequences.shape}')
sample headline: mom starting to fear son's web series closest thing she will have to grandchild

post-padded sequence: [ 140 825 2 813 1100 2048 571 5057 199 139 39 46 2 13050 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

pre-padded sequence: [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 140 825 2 813 1100 2048 571 5057 199 139 39 46 2 13050]

shape of post-padded sequences: (26709, 39)
shape of pre-padded sequences: (26709, 39)
This concludes the short demo of text data preprocessing on a relatively large dataset. Next week, you will start building models that can be trained on these output sequences. See you there!