GPU-Based Environment #361

Open · DavidAkinpelu opened this issue Jun 27, 2024 · 0 comments
I am trying to create a GPU-based environment in which a model (say ResNet-18) is trained, following PointChasingEnv: https://github.com/AI4Finance-Foundation/ElegantRL/blob/master/elegantrl/envs/PointChasingEnv.py. Does that imply that I need to create several models (num_envs of them) and train loaders too?
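For reference, here is the pattern I am trying to mirror as I understand it from PointChasingEnv (paraphrased from memory, not the actual file): every per-environment quantity lives in a tensor with a leading num_envs dimension, so a single batched tensor op advances all environments at once.

import torch

class BatchedToySketch:
    """My paraphrase of the PointChasingEnv pattern, not the real class."""

    def __init__(self, num_envs: int = 4, state_dim: int = 2, device_id: int = 0):
        self.num_envs = num_envs
        self.device = torch.device(f'cuda:{device_id}' if torch.cuda.is_available() else 'cpu')
        self.states = torch.zeros((num_envs, state_dim), device=self.device)

    def reset(self) -> torch.Tensor:
        self.states = torch.rand((self.num_envs, self.states.shape[1]), device=self.device)
        return self.states

    def step(self, actions: torch.Tensor):
        # one batched op updates all num_envs environments simultaneously
        self.states = self.states + actions.to(self.device)
        rewards = -self.states.norm(dim=1)
        dones = self.states.norm(dim=1) < 0.1
        return self.states, rewards, dones, {}

With one trained network per sub-environment, though, it seems I cannot avoid Python-level lists of models, optimizers, and DataLoaders. This is an example of what I have been able to come up with so far: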

import numpy as np
import torch
import torch.nn.functional as F
import wandb
from torch import Tensor
from torch.utils.data import DataLoader

# load_dataset and load_models are my own helpers (not shown): they return the
# (train, val, test) datasets and a model constructor for the given architecture.


class AdaptiveLearningRateOptimizer:
    """
    Optimization environment.
    Can be used to learn an adaptive learning rate schedule.

    Observations (7):
        0: Training loss
        1: Validation loss
        2: Variance of predictions
        3: Variance of prediction changes
        4: Mean of output weight matrix
        5: Variance of output weight matrix
        6: Learning rate

    Actions - Discrete (3):
        0: Increases the learning rate
        1: Decreases the learning rate
        2: No-op

    Actions - Continuous (1):
        0: Scaling factor for the learning rate
    """

    def __init__(self, dataset, architecture, batch_size, max_steps, initial_lr, prefix, 
                 num_envs=10, device_id=0, discrete=True, action_range=1.05, lr_noise=True):
        super().__init__()

        class SpecDummy():
            def __init__(self, id):
                self.id = id

        self.spec = SpecDummy(id='AdaptiveLearningRateContinuous-v0' if not discrete else 'AdaptiveLearningRate-v0')

        self.dataset = dataset
        self.architecture = architecture
        self.device = torch.device(f'cuda:{device_id}' if torch.cuda.is_available() else 'cpu')
        self.num_envs = num_envs
        self.train_dataset, self.val_dataset, self.test_dataset = load_dataset(dataset)
        self.net = [load_models(architecture, dataset)().to(self.device) for _ in range(self.num_envs)]
        self.optimizers = [torch.optim.Adam(net.parameters(), lr=initial_lr) for net in self.net]
        self.batch_size = batch_size
        self.max_steps = max_steps
        self.initial_lr = initial_lr
        self.ep_initial_lr = torch.tensor([initial_lr], dtype=torch.float32).repeat(self.num_envs, 1)
        self.discrete = discrete
        self.action_range = action_range
        self.last_network_predictions = [[] for _ in range(self.num_envs)]  # per-env list of the last validation outputs
        self.latest_end_val = torch.empty(self.num_envs, 1)
        self.reward_tolerance = 1e-7
        self.prev_reward = torch.zeros(self.num_envs, 1)
        # reward buffers are kept on the CPU, like the rest of the state tensors
        self.reward_buffer = torch.zeros((self.num_envs, 0), dtype=torch.float32)
        self.episode_reward_buffer = torch.zeros((self.num_envs, 0), dtype=torch.float32)
        self.episode_mean_reward = torch.zeros(self.num_envs, 1)
        self.prefix = prefix
        self.test_generator = DataLoader(self.test_dataset, batch_size=batch_size, shuffle=True)
        self.lr_noise = lr_noise
        self.lr = torch.tensor([self.initial_lr], dtype=torch.float32).repeat(self.num_envs, 1)
        self.avg_train_loss = torch.zeros(self.num_envs, 1)
        self.avg_val_loss = torch.zeros(self.num_envs, 1)
        self.network_prediction_var = torch.zeros(self.num_envs, 1)
        self.network_prediction_change_var = torch.zeros(self.num_envs, 1)
        self.output_layer_weights_mean = torch.zeros(self.num_envs, 1)
        self.output_layer_weights_var = torch.zeros(self.num_envs, 1)

        self.current_step = torch.zeros(self.num_envs, 1)

        self.env_name = "AdaptiveLearningRateVecEnv"

        # Initialize the criterion (loss function)
        self.criterion = torch.nn.CrossEntropyLoss()
    
        self.train_generators = [DataLoader(self.train_dataset, batch_size=batch_size, shuffle=True) for _ in range(self.num_envs)]
        self.val_generators = [DataLoader(self.val_dataset, batch_size=batch_size, shuffle=True) for _ in range(self.num_envs)]
        self.test_generators = [DataLoader(self.test_dataset, batch_size=batch_size, shuffle=True) for _ in range(self.num_envs)]

        self.state = torch.zeros(self.num_envs, 7)
       
    def get_state(self) -> Tensor:
        return torch.cat([self.avg_train_loss, self.avg_val_loss, self.network_prediction_var, 
                          self.network_prediction_change_var, self.output_layer_weights_mean, 
                          self.output_layer_weights_var, self.lr], dim=1)

    def _clip_lr(self, idx):
        self.lr[idx] = torch.clamp(self.lr[idx], min=1e-6, max=1e-1)

    def _add_lr_noise(self, idx, std=None, clip=True):
        if std is None:
            std = 1e-5
        
        # Generate noise with the same shape as this environment's lr entry
        noise = torch.normal(mean=0.0, std=std, size=self.lr[idx].shape, device=self.lr.device)
        
        # Add noise to lr
        self.lr[idx] = self.lr[idx] + noise
        
        if clip:
            self._clip_lr(idx=idx)

    def _update_lr(self, action, idx, clip=True):
        # Ensure action is a tensor and get the first element if it's not a scalar
        if not isinstance(action, torch.Tensor):
            action = torch.tensor(action)
        if action.dim() > 0:
            action = action[0]

        # Update learning rate based on action
        if self.discrete:
            if action == 0:
                self.lr[idx] *= self.action_range
            elif action == 1:
                self.lr[idx]  /= self.action_range
        else:
            self.lr[idx]  *= action.float()

        # Only apply noise if this is not the environment's first step
        if self.current_step[idx].item() != 0:
            # Add noise if enabled
            if self.lr_noise:
                self._add_lr_noise(clip=clip, idx=idx)

        # Clip lr if required
        if clip:
            self._clip_lr(idx)

        # Apply the new learning rate by rebuilding the optimizer. Note that this also
        # resets Adam's moment estimates; updating optimizer.param_groups would keep them.
        self.optimizers[idx] = torch.optim.Adam(self.net[idx].parameters(), lr=self.lr[idx].item())
        
    def test(self, idx):
        self.net[idx].eval()  # Set the model to evaluation mode
        total_test_loss = 0.0
        total_correct = 0
        total_test_samples = 0
        with torch.no_grad():  # Disable gradient calculation
            for x, y in self.test_generators[idx]:
                x, y = x.to(self.device), y.to(self.device)
                outputs = self.net[idx](x)
                loss = F.cross_entropy(outputs, y)
                total_test_loss += loss.item() * x.size(0)
                # Get the predicted class by taking the argmax of the output
                _, predicted = torch.max(outputs, 1)
                total_correct += (predicted == y).sum().item()
                total_test_samples += x.size(0)
        avg_loss = total_test_loss / total_test_samples
        accuracy = total_correct / total_test_samples
        return avg_loss, accuracy

    def train_epoch(self, idx):
        self.net[idx].train()  # Ensure the network is in training mode
        total_train_loss = 0.0
        total_train_samples = 0
        for x, y in self.train_generators[idx]:
            x, y = x.to(self.device), y.to(self.device)
            self.optimizers[idx].zero_grad()
            outputs = self.net[idx](x)
            loss = F.cross_entropy(outputs, y)
            loss.backward()
            self.optimizers[idx].step()
            total_train_loss += loss.item() * x.size(0)
            total_train_samples += x.size(0)
        avg_train_loss = total_train_loss / total_train_samples
        return avg_train_loss

    def eval_epoch(self, idx):
        self.net[idx].eval()  # Ensure the network is in evaluation mode
        running_loss = 0.0
        network_predictions = []
        network_prediction_var = 0
        with torch.no_grad():
            for inputs, targets in self.val_generators[idx]:
                inputs, targets = inputs.to(self.device), targets.to(self.device)
                outputs = self.net[idx](inputs)
                network_prediction_var += outputs.var().cpu()
                network_predictions.append(outputs)
                loss = self.criterion(outputs, targets)
                running_loss += loss.item()
        avg_val_loss = running_loss / len(self.val_generators[idx])
        return avg_val_loss, network_predictions, network_prediction_var
    
    def reset_idx(self, idx, seed=None):
        #setproctitle(f'PPO-ALRS-v0-{idx}')
        
        self.train_generators[idx] = DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)
        self.val_generators[idx] = DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=True)

        
        self.last_network_predictions[idx] = []
        self.current_step[idx] = 0
        self.prev_reward[idx] = 0.0
        # rows of the fixed-width (num_envs, k) reward buffers cannot be shrunk
        # independently, so just zero this environment's row
        self.reward_buffer[idx] = 0.0
        self.episode_reward_buffer[idx] = 0.0

        if self.initial_lr is None:
            self.lr[idx] = float(np.random.choice([1e-2, 1e-3, 1e-4]))
        else:
            self.lr[idx] = self.initial_lr
        
        self.optimizers[idx] = torch.optim.Adam(self.net[idx].parameters(), lr=self.lr[idx].item())
        
        avg_train_loss = self.train_epoch(idx)
        avg_val_loss, network_predictions, network_prediction_var = self.eval_epoch(idx)

        # Variance of prediction changes for this environment only (use a fresh loop
        # variable so the method's idx is not shadowed)
        env_change_vars = []
        for i, pred in enumerate(network_predictions):
            last_pred = (self.last_network_predictions[idx][i]
                         if len(self.last_network_predictions[idx]) > i
                         else torch.zeros_like(pred))
            env_change_vars.append((pred - last_pred).var())
        network_prediction_change_var = torch.stack(env_change_vars).mean().cpu()
        self.last_network_predictions[idx] = network_predictions
        
        output_layer_weights = list(self.net[idx].parameters())[-2]
        reward = -avg_val_loss
        
        # fill this environment's row of the fixed-width buffers with the reset reward
        self.reward_buffer[idx] = reward
        self.episode_reward_buffer[idx] = reward
        
        self.avg_train_loss[idx] = avg_train_loss
        self.avg_val_loss[idx] = avg_val_loss
        self.network_prediction_var[idx] = network_prediction_var
        self.network_prediction_change_var[idx] = network_prediction_change_var
        self.output_layer_weights_mean[idx] = output_layer_weights.mean().item()
        self.output_layer_weights_var[idx] = output_layer_weights.var().item()
        
        self.state[idx] = torch.tensor([
            avg_train_loss,
            avg_val_loss,
            network_prediction_var.item(),
            network_prediction_change_var.item(),
            output_layer_weights.mean().item(),
            output_layer_weights.var().item(),
            self.lr[idx].item()
        ], dtype=torch.float32)
        
        return self.state[idx], {}

    def step(self, actions):
        for idx in range(self.num_envs):
            self._update_lr(actions[idx], idx)
        
        # Training
        self.avg_train_loss = torch.tensor([self.train_epoch(i) for i in range(self.num_envs)],
                                           dtype=torch.float32).unsqueeze(1)
        
        # Evaluation
        eval_results = [self.eval_epoch(i) for i in range(self.num_envs)]
        self.avg_val_loss = torch.tensor([res[0] for res in eval_results], dtype=torch.float32).unsqueeze(1)
        network_predictions = [res[1] for res in eval_results]
        self.network_prediction_var = torch.stack([res[2] for res in eval_results]).unsqueeze(1)

        # Calculate network prediction change variance
        network_prediction_change_var = []
        for idx in range(self.num_envs):
            env_change_vars = []
            for i, pred in enumerate(network_predictions[idx]):
                last_pred = self.last_network_predictions[idx][i] if self.last_network_predictions[idx] and i < len(self.last_network_predictions[idx]) else torch.zeros_like(pred)
                env_change_vars.append((pred - last_pred).var())
            network_prediction_change_var.append(torch.stack(env_change_vars).mean())
        self.network_prediction_change_var = torch.stack(network_prediction_change_var).unsqueeze(1).cpu()
        
        self.last_network_predictions = network_predictions

        # Update output-layer weight statistics (detached and moved to CPU so they
        # can be concatenated with the other CPU-side state tensors)
        output_layer_weights = torch.stack([list(self.net[i].parameters())[-2] for i in range(self.num_envs)])
        self.output_layer_weights_mean = output_layer_weights.mean(dim=[1, 2]).detach().cpu().unsqueeze(1)
        self.output_layer_weights_var = output_layer_weights.var(dim=[1, 2]).detach().cpu().unsqueeze(1)

        rewards = -self.avg_val_loss

        # Update reward buffers
        self.reward_buffer = torch.cat([self.reward_buffer, rewards], dim=1)
        if self.reward_buffer.shape[1] > 5:
            self.reward_buffer = self.reward_buffer[:, -5:]
        self.episode_reward_buffer = torch.cat([self.episode_reward_buffer, rewards], dim=1)

        self.state = torch.cat([
            self.avg_train_loss,
            self.avg_val_loss,
            self.network_prediction_var,
            self.network_prediction_change_var,
            self.output_layer_weights_mean,
            self.output_layer_weights_var,
            self.lr  # already shaped (num_envs, 1)
        ], dim=1)

        self.current_step += 1  # This now increments all elements of the tensor

        info = [{
            'train_loss': self.avg_train_loss[i].item(),
            'val_loss': self.avg_val_loss[i].item(),
            'lr': self.lr[i].item()
        } for i in range(self.num_envs)]

        truncated = self.current_step >= self.max_steps
        # Only check the plateau condition once a few rewards have accumulated,
        # otherwise max - min is trivially zero after the first step
        if self.reward_buffer.shape[1] >= 5:
            terminated = (self.reward_buffer.max(dim=1)[0] - self.reward_buffer.min(dim=1)[0] < self.reward_tolerance).unsqueeze(1)
        else:
            terminated = torch.zeros(self.num_envs, 1, dtype=torch.bool)

        if self.prefix != 'train_agent':
            wandb.log({
                f'{self.prefix}_learning_rate': self.lr.mean().item(),
                f'{self.prefix}_final_val_loss': self.avg_val_loss.mean().item(),
                f'{self.prefix}_final_train_loss': self.avg_train_loss.mean().item(),
                f'action': actions.mean().item()
            })

        self.prev_reward = rewards

        episode_ended = truncated | terminated
        for i in range(self.num_envs):
            if episode_ended[i]:
                self.episode_mean_reward[i] = self.episode_reward_buffer[i].mean()
                wandb.log({f'{self.prefix}_episode_mean_reward_{i}': self.episode_mean_reward[i].item()})
                # Reset this environment's reward history; the fixed-width row cannot
                # be shrunk independently, so zero it out instead
                self.episode_reward_buffer[i] = 0.0

        return self.state, rewards, episode_ended, info
        
    def reset(self):
        for idx in range(self.num_envs):
            self.reset_idx(idx)
        return self.get_state()
    
    def render(self, mode='human'):
        pass
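
# (Aside on the continuous action: right now _update_lr multiplies the learning rate
#  by the raw action value. One option I am considering instead is mapping an agent
#  action in [-1, 1] to a bounded multiplicative factor -- this helper is my own
#  sketch, not anything from ElegantRL.)
def action_to_lr_scale(action: torch.Tensor, action_range: float = 1.05) -> torch.Tensor:
    """Map a raw action in [-1, 1] to an LR factor in [1/action_range, action_range]."""
    return action_range ** action.clamp(-1.0, 1.0)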


def check_chasing_vec_env():
    env = AdaptiveLearningRateOptimizer(max_steps=200, dataset='cifar10', batch_size=256, initial_lr=0.001,
                                        prefix='PPO_LR', discrete=False, num_envs=4, architecture='resnet18')
    # step() logs to wandb, so start a disabled run for this smoke test
    wandb.init(project='adaptive-lr-env-check', mode='disabled')

    reward_sums = [0.0 for _ in range(env.num_envs)]
    # a comprehension gives each environment its own list; multiplying [[]] would
    # make every entry alias the same list object
    reward_sums_list = [[] for _ in range(env.num_envs)]
    
    states = env.reset()
    for _ in range(env.max_steps * 4):
        # this environment has no get_action() helper (that belongs to the
        # PointChasingEnv test), so drive it with random continuous LR-scaling actions
        actions = 1.0 + 0.05 * torch.randn(env.num_envs, 1)
        states, rewards, dones, _ = env.step(actions)

        for env_i in range(env.num_envs):
            reward_sums[env_i] += rewards[env_i].item()
            if dones[env_i]:
                print(f"episode reward sum: {reward_sums[env_i]:8.4f}    "
                      f"lr: {env.lr[env_i].item():.2e}")
                reward_sums_list[env_i].append(reward_sums[env_i])
                reward_sums[env_i] = 0.0

    reward_sums_list = np.array(reward_sums_list)
    print("shape:", reward_sums_list.shape)
    print("mean: ", np.mean(reward_sums_list, axis=1))
    print("std:  ", np.std(reward_sums_list, axis=1))

if __name__ == '__main__':
    print("Running test")
    check_chasing_vec_env()
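
One concrete sub-question: since individual rows of a single (num_envs, k) tensor cannot be cleared independently, should each sub-environment keep its own Python-side reward history instead? Something like the small helper below is what I have in mind (my own sketch, not anything from ElegantRL):

class EpisodeRewardBuffer:
    """One independent reward history per sub-environment."""

    def __init__(self, num_envs: int):
        self.buffers = [[] for _ in range(num_envs)]  # one Python list per env

    def add(self, idx: int, reward: float) -> None:
        self.buffers[idx].append(float(reward))

    def finish(self, idx: int) -> float:
        """Return the episode mean reward for env idx and clear its history."""
        mean_r = sum(self.buffers[idx]) / max(len(self.buffers[idx]), 1)
        self.buffers[idx].clear()
        return mean_r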

Any suggestions on whether this is the right way to structure a GPU-based vectorized environment (one model, optimizer, and DataLoader per sub-environment), or is there a better pattern?
