open plants.Rmd

---
title: "open plants"
author: "Julin Maloof"
date: "10/26/2021"
output: html_document
---

```{r setup, include=FALSE}
# Set the default chunk option echo = TRUE for every chunk in this document.
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
# Install the Python package pandas into the "r-reticulate" conda environment.
reticulate::conda_install(envname = "r-reticulate", packages = "pandas")
```


```{r}
# Packages used throughout this notebook.
library(tidyverse)
library(jsonlite)
library(keras)
# Select the "r-reticulate" conda environment, the same environment name that
# pandas is installed into in the chunk above.
use_condaenv("r-reticulate")
```


```{r}
# Parse the annotation JSON into `info`. Later chunks use its `eppo`,
# `growth_condition` and `filename` columns. The path is relative to the
# working directory.
info <- jsonlite::fromJSON("open_plants/annotations_all.json")
```

```{r}
# Quick look at the parsed annotations: first rows, then dimensions.
head(info)
dim(info)
```

For each species and each growth condition do an 80/10/10 split

First, how many images are in each class?

```{r}
# Number of images for every species (eppo) x growth condition combination.
info %>%
  group_by(eppo, growth_condition) %>%
  summarise(count = n())
```
We want to eliminate species with low image counts.

```{r}
# For each species (eppo), the image count of its sparsest growth condition.
# Computed once and reused below rather than repeating the same pipeline for
# every view of the data.
min_counts <- info %>%
  group_by(eppo, growth_condition) %>%
  summarize(count = n()) %>%
  group_by(eppo) %>%
  summarize(min_count = min(count))

# Species ordered from the lowest to the highest minimum count.
min_counts %>%
  arrange(min_count)

# Species with more than 99 images in each growth condition they appear in.
min_counts %>%
  filter(min_count > 99) %>%
  arrange(eppo)

# Species codes to work with: the first (at most) 10 qualifying species, in
# the row order of `min_counts`.
keep <- min_counts %>%
  filter(min_count > 99) %>%
  filter(row_number() <= 10) %>%
  pull(eppo)

keep
```
```{r}
# Show the working directory; the relative "open_plants/..." paths used in
# this notebook are resolved against it.
getwd()
```

```{r}
# Build the working table:
#   * restrict to the first six species in `keep`;
#   * within each species x growth-condition group, label rows as
#     train / validate / test from their percent rank by row position
#     (cut points at 0.8 and 0.9);
#   * add the absolute path of each image file;
#   * add 0-based integer codes for species (eppo1) and growth condition (gc1).
info.sm <- info %>%
  filter(eppo %in% keep[1:6]) %>%
  group_by(eppo, growth_condition) %>%
  mutate(
    group = cut(
      percent_rank(row_number()),
      breaks = c(-1, 0.8, 0.9, 1),
      labels = c("train", "validate", "test")
    )
  ) %>%
  ungroup() %>%
  mutate(
    file.path = file.path(getwd(), "open_plants", "images", eppo, filename),
    eppo1 = as.integer(as.factor(eppo)) - 1,
    gc1 = as.integer(as.factor(growth_condition)) - 1
  )

head(info.sm)
```

Download data
```{r}
# Download one zip archive per kept species from the OPPD GitLab repository.
# A species is skipped when its archive or its extracted image directory is
# already present, so re-running the chunk does not download again.
image_dir <- file.path("open_plants", "images")
# Make sure the destination directory exists so the first download on a fresh
# checkout does not fail; this is a no-op when the directory is already there.
dir.create(image_dir, recursive = TRUE, showWarnings = FALSE)

for (k in keep) {
  url <- str_c("https://gitlab.au.dk/AUENG-Vision/OPPD/-/archive/master/OPPD-master.zip?path=DATA/images_plants/", k)
  destfile <- file.path(image_dir, str_c(k, ".zip"))
  # The condition is a single TRUE/FALSE, so use short-circuiting `&&` rather
  # than the elementwise `&`.
  if (!file.exists(destfile) && !dir.exists(file.path(image_dir, k))) {
    print(k)
    curl::curl_download(url, destfile, quiet = FALSE)
  }
}
```


```{r}
# Extract every downloaded archive into open_plants/images/<archive name>,
# skipping archives whose target directory already exists.
images_root <- file.path("open_plants", "images")

# `pattern` is a regular expression: escape the dot and anchor at the end so
# only file names that really end in ".zip" are picked up.
zip_files <- dir(images_root, pattern = "\\.zip$", full.names = TRUE)

for (z in zip_files) {
  # Target directory = archive file name without its trailing ".zip".
  exdir <- file.path(images_root, str_remove(basename(z), "\\.zip$"))
  if (!dir.exists(exdir)) {
    print(z)
    # junkpaths = TRUE discards the directory structure stored inside the
    # archive, so the files are written directly into `exdir`.
    unzip(z, exdir = exdir, junkpaths = TRUE)
  }
}
```


```{r}
# Build a keras image generator for one data split of `info.sm`.
#
# which_split: one of the labels in info.sm$group ("train", "validate",
#   "test").
# Returns the result of flow_images_from_dataframe(): images are read from the
# absolute paths in the `file.path` column (hence directory = NULL), resized
# to 128 x 128 and rescaled by 1/255, with two outputs taken from the `eppo1`
# and `gc1` columns.
make_split_generator <- function(which_split) {
  split_df <- info.sm %>%
    filter(group == which_split) %>%
    as.data.frame()

  flow_images_from_dataframe(
    split_df,
    directory = NULL,
    x_col = "file.path",
    class_mode = "multi_output",
    target_size = c(128, 128),
    generator = image_data_generator(rescale = 1 / 255),
    y_col = list("eppo1", "gc1") # Can't be named list, apparently
  )
}

train_gen <- make_split_generator("train")
test_gen <- make_split_generator("test")
val_gen <- make_split_generator("validate")

# Pull one batch to inspect its structure. Note that, despite the variable
# name, this batch is drawn from the validation generator.
test <- generator_next(val_gen)
str(test)
```