Chapter 6.1.Rmd

---
title: "Chapter 6.1"
output: html_notebook
---

## one hot encoding

### word level
### listing 6.1 from book.

create the token index
```{r}
# Toy corpus: each element is one "sample" (a single sentence here, but a
# sample could just as well be a whole document).
samples <- c("The cat sat on the mat.", "The dog ate my homework.")

# Tokenize by splitting every sample on single spaces and keep each distinct
# token once, in order of first appearance. Nothing is stripped or
# lower-cased, so "The"/"the" and "mat." are separate tokens; real code would
# normally remove punctuation and special characters first.
vocabulary <- unique(unlist(strsplit(samples, " ")))

# Give every token a unique index, stored as a named list. Numbering starts
# at 2 because index 1 is deliberately not attributed to anything.
token_index <- as.list(setNames(seq_along(vocabulary) + 1, vocabulary))

token_index
```

create the one hot encoding
```{r}
# Only the first `max_length` words of each sample are encoded.
max_length <- 10

# Output tensor, all zeros to begin with:
#   dim 1 = sample, dim 2 = word position, dim 3 = token index.
# The third dimension is as long as the largest index in `token_index`.
vocab_size <- max(as.integer(token_index))
results <- array(0, dim = c(length(samples), max_length, vocab_size)) #3D array

for (sample_id in seq_along(samples)) {
  tokens <- head(strsplit(samples[[sample_id]], " ")[[1]], n = max_length)

  # Look up the index of every token of this sample.
  token_ids <- vapply(
    tokens,
    function(tok) token_index[[tok]],
    numeric(1),
    USE.NAMES = FALSE
  )

  # One (sample, position, token) coordinate per row; matrix indexing sets
  # all of those cells to 1 in a single assignment.
  cells <- cbind(
    rep(sample_id, length(tokens)),
    seq_along(tokens),
    token_ids
  )
  results[cells] <- 1
}

results
```

## Character level one-hot
## listing 6.2

```{r}
samples <- c("The cat sat on the mat.", "The dog ate my homework.")

# Token table: the empty string followed by the printable ASCII characters
# (codes 32 to 126). A token's index is its position in this table.
ascii_tokens <- c("", vapply(as.raw(32:126), rawToChar, character(1)))
token_index <- seq_along(ascii_tokens)
names(token_index) <- ascii_tokens

# Only the first `max_length` characters of each sample are encoded.
max_length <- 50

# Output tensor: sample x character position x token index, all zeros.
results <- array(0, dim = c(length(samples), max_length, length(token_index)))

for (i in seq_along(samples)) {
  # Truncate to `max_length`, as the word-level listing does, so that a long
  # sample cannot index past the second dimension of `results`.
  characters <- head(strsplit(samples[[i]], "")[[1]], n = max_length)
  # seq_along() makes this loop a no-op for an empty sample.
  for (j in seq_along(characters)) {
    results[i, j, token_index[[characters[[j]]]]] <- 1
  }
}
```

## keras functions for one-hot encoding
## listing 6.3

```{r}
library(keras)
use_condaenv("r-reticulate")

samples <- c("The cat sat on the mat.", "The dog ate my homework.")

# Build a tokenizer restricted to the 1,000 most common words, then fit it
# on the samples so that it computes its word index.
tokenizer <- text_tokenizer(num_words = 1000)
tokenizer <- fit_text_tokenizer(tokenizer, samples)

# Each string becomes a list of integer indices.
sequences <- texts_to_sequences(tokenizer, samples)

# The one-hot binary representation can also be requested directly; the
# tokenizer supports vectorization modes other than "binary" as well.
one_hot_results <- texts_to_matrix(tokenizer, samples, mode = "binary")

# The word index computed during fitting can be read back from the tokenizer.
word_index <- tokenizer$word_index

cat("Found", length(word_index), "unique tokens.\n")

```

```{r}
library(hashFunction)

samples <- c("The cat sat on the mat.", "The dog ate my homework.")

# We will store our words as vectors of size 1000.
# Note that if you have close to 1000 words (or more)
# you will start seeing many hash collisions, which
# will decrease the accuracy of this encoding method.
dimensionality <- 1000
# Only the first `max_length` words of each sample are encoded.
max_length <- 10

# Output tensor: sample x word position x hash slot, all zeros.
results <- array(0, dim = c(length(samples), max_length, dimensionality))

for (i in seq_along(samples)) {
  words <- head(strsplit(samples[[i]], " ")[[1]], n = max_length)
  for (j in seq_along(words)) {
    # Hash the word into a "random" integer slot between 1 and
    # `dimensionality`. The modulo alone yields 0..dimensionality - 1, and
    # 0 is not a valid subscript for `[[<-` on an R array, so shift by one.
    index <- (abs(spooky.32(words[[j]])) %% dimensionality) + 1
    results[[i, j, index]] <- 1
  }
}

results[,,1:10]
```

## Layer embedding

Here the model learns how to represent the words in a denser array

### listing 6.6
working with IMDB
```{r}
# Vocabulary size: number of words to consider as features.
max_features <- 10000
# Reviews are cut after this many words
# (counted among the top `max_features` most common words).
maxlen <- 20

# Load the data; every review is a list of integers.
imdb <- dataset_imdb(num_words = max_features)
x_train <- imdb$train$x
y_train <- imdb$train$y
x_test <- imdb$test$x
y_test <- imdb$test$y

# Turn the lists of integers into 2D integer tensors of shape
# `(samples, maxlen)`.
x_train <- pad_sequences(x_train, maxlen = maxlen)
x_test <- pad_sequences(x_test, maxlen = maxlen)
dim(x_test)
# One row per review and one column per kept word position (maxlen = 20);
# each cell holds the integer token of a word.
head(x_test)
```

### listing 6.7
```{r, echo=TRUE, results='hide'}
model <- keras_model_sequential()

# The maximum input length is given to the embedding layer so that the
# embedded inputs can be flattened afterwards.
# 10000 possible words that will be represented in 8 dimensions.
model <- layer_embedding(
  model,
  input_dim = 10000,
  output_dim = 8,
  input_length = maxlen
)

# Flatten the 3D tensor of embeddings into a 2D tensor of shape
# `(samples, maxlen * 8)`.
model <- layer_flatten(model)

# Classifier on top: a single sigmoid unit.
model <- layer_dense(model, units = 1, activation = "sigmoid")

compile(
  model,
  optimizer = "rmsprop",
  loss = "binary_crossentropy",
  metrics = c("acc")
)

# Train for 10 epochs, holding out 20% of the training data for validation.
history <- fit(
  model,
  x_train, y_train,
  epochs = 10,
  batch_size = 32,
  validation_split = 0.2
)
```

```{r}
# Plot the training history returned by fit() in the previous chunk.
plot(history)
# Print the history object itself.
history
```

## use someone else's embedding (raw text to word embeddings)

### listing 6.8

```{bash, eval=FALSE}
# Download the archive into the input/ directory (which must already exist).
# NOTE(review): presumably this is the raw IMDB review archive, since later
# chunks read input/aclImdb; confirm.
cd input
# NOTE(review): --no-check-certificate turns off TLS certificate
# verification for this download.
wget --no-check-certificate https://mng.bz/0tIo
```

```{bash, eval=FALSE}
# Unpack the downloaded archive (saved under the name 0tIo) inside input/,
# then remove the __MACOSX directory and the archive itself.
cd input
unzip -q 0tIo
rm -r __MACOSX
rm 0tIo
```


```{r}
imdb_dir <- "input/aclImdb"
train_dir <- file.path(imdb_dir, "train")

# Sub-directory name -> numeric label (negative = 0, positive = 1).
label_values <- c(neg = 0, pos = 1)

# List the review files of each class once, negative reviews first.
files_by_label <- lapply(names(label_values), function(label_type) {
  list.files(file.path(train_dir, label_type), pattern = glob2rx("*.txt"),
             full.names = TRUE)
})
review_files <- unlist(files_by_label, use.names = FALSE)

# Read every file whole into one string. Building the vectors in a single
# pass avoids re-copying `texts`/`labels` on every file, which is what
# appending with c() inside a loop does.
texts <- vapply(
  review_files,
  function(fname) readChar(fname, file.info(fname)$size),
  character(1),
  USE.NAMES = FALSE
)

# One label per text, in the same order as `texts`.
labels <- rep(unname(label_values), times = lengths(files_by_label))
```

Tokenize it
```{r}

max_words <- 10000            # Only the top 10,000 words of the dataset are used
maxlen <- 100                 # Reviews are cut after 100 words
training_samples <- 200       # Number of samples used for training
validation_samples <- 10000   # Number of samples used for validation

# Fit a tokenizer on the raw texts and convert each text to integer indices.
tokenizer <- text_tokenizer(num_words = max_words)
tokenizer <- fit_text_tokenizer(tokenizer, texts)
sequences <- texts_to_sequences(tokenizer, texts)

word_index <- tokenizer$word_index
cat("Found", length(word_index), "unique tokens.\n")

# Pad/truncate every sequence to `maxlen` columns.
data <- pad_sequences(sequences, maxlen = maxlen)
labels <- as.array(labels)

cat("Shape of data tensor:", dim(data), "\n")
cat("Shape of label tensor:", dim(labels), "\n")

# The samples are ordered (all negative first, then all positive), so shuffle
# the row order before carving out the training and validation sets.
indices <- sample(seq_len(nrow(data)))
training_indices <- indices[seq_len(training_samples)]
# The validation set is the next `validation_samples` shuffled rows.
validation_indices <- indices[training_samples + seq_len(validation_samples)]

x_train <- data[training_indices, ]
y_train <- labels[training_indices]

x_val <- data[validation_indices, ]
y_val <- labels[validation_indices]
```

get the glove encoding

```{bash, eval=FALSE}
# Download the GloVe 6B archive into input/ (-nv: less verbose wget output).
cd input
wget -nv http://nlp.stanford.edu/data/glove.6B.zip
```


```{bash, eval=FALSE}
# Unpack the GloVe archive inside input/ and delete the zip afterwards.
cd input
unzip glove.6B.zip
rm glove.6B.zip
```

preprocess the glove encoding to turn words into numbers


```{r}
glove_dir <- "input"
lines <- readLines(file.path(glove_dir, "glove.6B.100d.txt"))

# Peek at the first lines: each line is a word followed by its "loadings"
# on each of the embedding dimensions. head() avoids the NA padding that
# lines[1:10] would produce on a file shorter than 10 lines.
strsplit(head(lines, 10), " ")

# Map word -> numeric embedding vector, stored in a hashed environment.
embeddings_index <- new.env(hash = TRUE, parent = emptyenv())

# Iterating over the lines directly (rather than 1:length(lines)) keeps the
# loop from running on an empty file.
for (line in lines) {
  # The separator is a literal space, so a fixed-string split is sufficient.
  values <- strsplit(line, " ", fixed = TRUE)[[1]]
  word <- values[[1]] # the word, since this comes first
  embeddings_index[[word]] <- as.double(values[-1]) # the loadings
}

cat("Found", length(embeddings_index), "word vectors.\n")
```

```{r}
# Show a few of the words stored in the embeddings environment.
head(names(embeddings_index))
# Show the vector stored under the word "house".
embeddings_index$house
```

Now create an embedding matrix that we can use in keras.  For each word in the IMDB word index (up to 10,000 words), this looks up the word's GloVe vector and stores it in the matrix row for that word.

```{r}
embedding_dim <- 100

# One row per word slot and one column per embedding dimension, all zeros.
embedding_matrix <- matrix(0, nrow = max_words, ncol = embedding_dim)

for (word in names(word_index)) {
  index <- word_index[[word]]

  # Skip words whose index falls outside the `max_words` limit.
  if (index >= max_words) {
    next
  }

  embedding_vector <- embeddings_index[[word]]

  # Words not found in the embedding index keep their all-zero row.
  if (is.null(embedding_vector)) {
    next
  }

  # The vector of the word with index k is written to row k + 1.
  embedding_matrix[index + 1, ] <- embedding_vector
}
```

```{r}
# Show the first entries of the tokenizer's word index.
cat("word index\n")
head(word_index)
# Show the dimensions of the embedding matrix and the first 10 columns of
# its first rows.
cat("embedding matrix\n")
dim(embedding_matrix)
head(embedding_matrix[,1:10])

# Show the first 10 values stored under "the" in embeddings_index, for
# comparison with the matrix rows printed above.
cat("embeddings_index$the\n")
embeddings_index$the[1:10]
```

So the first data row (row 2) of the embedding matrix corresponds to the first word in the IMDB word index and holds the embedding for that word.

### now define the model:

```{r}
model <- keras_model_sequential()

# Embedding layer sized to the vocabulary limit and embedding dimension.
model <- layer_embedding(
  model,
  input_dim = max_words,
  output_dim = embedding_dim,
  input_length = maxlen
)

# Flatten the embedded sequence, then classify with a small dense network
# ending in a single sigmoid unit.
model <- layer_flatten(model)
model <- layer_dense(model, units = 32, activation = "relu")
model <- layer_dense(model, units = 1, activation = "sigmoid")

summary(model)
```

add the embedding


```{r}
# Take the layer at index 1 (the embedding layer is the first layer added to
# this model), load the prepared embedding matrix as its weights, and then
# freeze the layer's weights.
embedding_layer <- get_layer(model, index = 1)
embedding_layer <- set_weights(embedding_layer, list(embedding_matrix))
freeze_weights(embedding_layer)
```


```{r, echo=TRUE, results='hide'}
compile(
  model,
  optimizer = "rmsprop",
  loss = "binary_crossentropy",
  metrics = c("acc")
)

# Train for 20 epochs, validating on the held-out validation set.
history <- fit(
  model,
  x_train, y_train,
  epochs = 20,
  batch_size = 32,
  validation_data = list(x_val, y_val)
)

# Save the trained weights to disk so they can be reloaded later.
save_model_weights_hdf5(model, "pre_trained_glove_model.h5")
```

Let's plot its performance over time:

```{r}
# Plot the training history returned by the preceding fit() call.
plot(history)
```

without pretrained embeddings


```{r, echo=TRUE, results='hide'}
# Same architecture as the GloVe model, but no weights are loaded into the
# embedding layer and nothing is frozen.
model <- keras_model_sequential()
model <- layer_embedding(
  model,
  input_dim = max_words,
  output_dim = embedding_dim,
  input_length = maxlen
)
model <- layer_flatten(model)
model <- layer_dense(model, units = 32, activation = "relu")
model <- layer_dense(model, units = 1, activation = "sigmoid")

compile(
  model,
  optimizer = "rmsprop",
  loss = "binary_crossentropy",
  metrics = c("acc")
)

# Train for 20 epochs, validating on the held-out validation set.
history <- fit(
  model,
  x_train, y_train,
  epochs = 20,
  batch_size = 32,
  validation_data = list(x_val, y_val)
)
```

```{r}
# Plot the training history of the model fitted in the previous chunk.
plot(history)
```
slightly worse

### evaluate test data

```{r}
test_dir <- file.path(imdb_dir, "test")

# Sub-directory name -> numeric label (negative = 0, positive = 1).
label_values <- c(neg = 0, pos = 1)

# List the review files of each class once, negative reviews first.
files_by_label <- lapply(names(label_values), function(label_type) {
  list.files(file.path(test_dir, label_type), pattern = glob2rx("*.txt"),
             full.names = TRUE)
})
review_files <- unlist(files_by_label, use.names = FALSE)

# Read every file whole into one string. Building the vectors in a single
# pass avoids re-copying `texts`/`labels` on every file, which is what
# appending with c() inside a loop does.
texts <- vapply(
  review_files,
  function(fname) readChar(fname, file.info(fname)$size),
  character(1),
  USE.NAMES = FALSE
)

# One label per text, in the same order as `texts`.
labels <- rep(unname(label_values), times = lengths(files_by_label))

# Convert the test texts with the existing `tokenizer` and pad them to
# `maxlen` columns.
sequences <- texts_to_sequences(tokenizer, texts)
x_test <- pad_sequences(sequences, maxlen = maxlen)
y_test <- as.array(labels)
```

And let's load and evaluate the first model:

```{r}
# Load the saved weights into the current model, then evaluate it on the
# test data.
restored_model <- load_model_weights_hdf5(model, "pre_trained_glove_model.h5")
evaluate(restored_model, x_test, y_test, verbose = 0)
```