""" 本文件只为用来生成中文文档,请不要另作它用 """
"""
POLIXIR REVIVE, copyright (C) 2021-2023 Polixir Technologies Co., Ltd., is
distributed under the GNU Lesser General Public License (GNU LGPL).
POLIXIR REVIVE is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
"""
import os
import sys
import json
import uuid
import socket
import pickle
import warnings
from copy import deepcopy
from typing import Dict, Union, Optional, Tuple
import ray
import numpy as np
from loguru import logger
from revive.utils.common_utils import get_reward_fn, list2parser, setup_seed
from revive.computation.inference import PolicyModel, VirtualEnv, VirtualEnvDev
from revive.conf.config import DEBUG_CONFIG, DEFAULT_CONFIG
from revive.data.dataset import OfflineDataset
from revive.utils.server_utils import DataBufferEnv, DataBufferPolicy, DataBufferTuner, Logger, VenvTrain, TuneVenvTrain, PolicyTrain, TunePolicyTrain, ParameterTuner
warnings.filterwarnings('ignore')
class ReviveServer:
r"""
ReviveServer是Revive SDK的训练入口,负责启动并管理所有训练任务。
`ReviveServer` 执行四个步骤来完成初始化:
1. 创建或连接到ray集群。集群地址由`address`参数控制。如果`address`参数为`None`,它将创建自己的集群。
如果指定了`address`参数,它将使用参数连接到现有集群。
2. 加载培训配置文件。提供的默认配置文件为`config.json`中。可以通过编辑文件来更改默认参数。
3. 加载决策流图,npz数据和函数。数据文件由参数`dataset_file_path `、`dataset_desc_file_path和`val_file_path'参数指定。
4. 创建日志文件夹存储训练结果。这些日志的顶层文件夹由`log_dir`参数控制。如果未提供,则默认生成的`logs`文件夹。
第二级文件夹由训练配置中的`run_id`参数控制,如果未指定,将为文件夹生成一个随机id。所有训练日志和模型都将放在第二级文件夹中。
参数:
:dataset_file_path (str):
训练数据的文件路径( ``.npz`` 或 ``.h5`` 文件)。
:dataset_desc_file_path (str):
决策流图的文件路径( ``.yaml`` )。
:val_file_path (str):
验证数据的文件路径(可选)。
:reward_file_path (str):
定义奖励函数的文件的存储路径。
:target_policy_name (str):
要优化的策略节点的名称。如果为None,则将选择决策流图中的第一网络节点作为策略节点。
:log_dir (str):
模型和训练日志存储文件夹
:run_id (str):
实验ID,用于生成日志文件夹名称,区分不同的实验。如果未提供,系统会自动生成。
:address (str):
ray集群地址,集群地址由`address`参数控制。如果`address`参数为`None`,它将创建自己的集群。如果指定了`address`参数,它将使用参数连接到现有集群。
:venv_mode ("tune","once","None"):
训练虚拟环境的不同模式:
`tune` 使用超参数搜索来训练虚拟环境模型,需要消耗大量的算力和时间,以搜寻超参数来获得更优的模型结果。
`once` 使用默认参数训练虚拟环境模型。
`None` 不训练虚拟环境模型。
:policy_mode ("tune","once","None"):
策略模型的训练模式:
`tune` 使用超参数搜索来训练策略模型,需要消耗大量的算力和时间,以搜寻超参数来获得更优的模型结果。
`once` 使用默认参数训练策略模型。
`None` 不训练策略模型。
:custom_config:
超参配置文件路径,可用于覆盖默认参数。
:kwargs:
关键字参数,可用于覆盖默认参数。
"""
def __init__(self,
dataset_file_path : str,
dataset_desc_file_path : str,
val_file_path : Optional[str] = None,
reward_file_path : Optional[str] = None,
target_policy_name : str = None,
log_dir : str = None,
run_id : Optional[str] = None,
address : Optional[str] = None,
venv_mode : str = 'tune',
policy_mode : str = 'tune',
tuning_mode : str = 'None',
tune_initial_state : Optional[Dict[str, np.ndarray]] = None,
debug : bool = False,
revive_config_file_path : Optional[str] = None,
**kwargs):
assert policy_mode == 'None' or tuning_mode == 'None', 'Cannot perform both policy training and parameter tuning!'
# ray.init(local_mode=True) # debug only
''' get config '''
config = DEBUG_CONFIG if debug else DEFAULT_CONFIG
parser = list2parser(config)
self.config = parser.parse_known_args()[0].__dict__
self.run_id = run_id or uuid.uuid4().hex
self.workspace = os.path.abspath(os.path.join(log_dir, self.run_id))
self.config['workspace'] = self.workspace
os.makedirs(self.workspace, mode=0o777, exist_ok=True)
assert os.path.exists(self.workspace)
self.revive_log_path = os.path.join(os.path.abspath(self.workspace),"revive.log")
self.config["revive_log_path"] = self.revive_log_path
logger.add(self.revive_log_path)
if revive_config_file_path is not None:
with open(revive_config_file_path, 'r') as f:
custom_config = json.load(f)
self.config.update(custom_config)
for parameter_description in custom_config.get('base_config', {}):
self.config[parameter_description['name']] = parameter_description['default']
revive_config_save_file_path = os.path.join(self.workspace, "config.json")
with open(revive_config_save_file_path, 'w') as f:
json.dump(self.config,f)
self.revive_config_file_path = revive_config_save_file_path
''' preprocess config'''
# NOTE: in crypto mode, each trial is fixed to use one GPU.
self.config['is_crypto'] = os.environ.get('REVIVE_CRYPTO', 0)
setup_seed(self.config['global_seed'])
self.venv_mode = venv_mode
self.policy_mode = policy_mode
self.tuning_mode = tuning_mode
self.tune_initial_state = tune_initial_state
self.reward_func = get_reward_fn(reward_file_path, dataset_desc_file_path)
self.config['user_func'] = self.reward_func
''' create dataset '''
self.data_file = dataset_file_path
self.config_file = dataset_desc_file_path
self.val_file = val_file_path
self.dataset = OfflineDataset(self.data_file, self.config_file, self.config['ignore_check'])
self._check_license()
self.runtime_env = {"env_vars": {"PYTHONPATH":os.pathsep.join(sys.path), "PYARMOR_LICENSE": sys.PYARMOR_LICENSE}}
ray.init(address=address, runtime_env=self.runtime_env)
if self.val_file:
self.val_dataset = OfflineDataset(self.val_file, self.config_file, self.config['ignore_check'])
self.val_dataset.processor = self.dataset.processor # make sure data processing is the same
self.config['val_dataset'] = ray.put(self.val_dataset)
else: # split the training set if validation set is not provided
self.dataset, self.val_dataset = self.dataset.split(self.config['val_split_ratio'], self.config['val_split_mode'])
self.config['val_dataset'] = ray.put(self.val_dataset)
self.config['dataset'] = ray.put(self.dataset)
self.config['graph'] = self.dataset.graph
self.graph = self.config['graph']
if not tuning_mode == 'None': assert len(self.dataset.graph.tunable) > 0, 'No tunable parameter detected, please check the config yaml!'
self.config['learning_nodes_num'] = self.dataset.learning_nodes_num
if target_policy_name is None:
target_policy_name = list(self.config['graph'].keys())[0]
logger.warning(f"Target policy name [{target_policy_name}] is chosen as default")
self.config['target_policy_name'] = target_policy_name
''' save a copy of the base graph '''
with open(os.path.join(self.workspace, 'graph.pkl'), 'wb') as f:
pickle.dump(self.config['graph'], f)
''' setup data buffers '''
self.driver_ip = socket.gethostbyname(socket.gethostname())
self.venv_data_buffer = ray.remote(DataBufferEnv).options(resources={}).remote(venv_max_num=self.config['num_venv_store'])
self.policy_data_buffer = ray.remote(DataBufferPolicy).options(resources={}).remote()
self.tuner_data_buffer = ray.remote(DataBufferTuner).options(resources={}).remote(self.tuning_mode, self.config['parameter_tuning_budget'])
self.config['venv_data_buffer'] = self.venv_data_buffer
self.config['policy_data_buffer'] = self.policy_data_buffer
self.config['tuner_data_buffer'] = self.tuner_data_buffer
''' try to load existing venv and policy '''
self.env_save_path = kwargs.get("env_save_path", None)
self.policy_save_path = kwargs.get("policy_save_path", None)
#self._reload_venv(os.path.join(self.workspace, 'env.pkl'))
#self._reload_policy(os.path.join(self.workspace, 'policy.pkl'))
self.venv_acc = - float('inf')
self.policy_acc = - float('inf')
self.venv_logger = None
self.policy_logger = None
self.tuner_logger = None
data = {"REVIVE_STOP" : False, "LOG_DIR":os.path.join(os.path.abspath(self.workspace),"revive.log")}
with open(os.path.join(self.workspace, ".env.json"), 'w') as f:
json.dump(data, f)
def _reload_venv(self, path: str, return_graph: bool = False):
r'''Reload the venv from the given path.'''
try:
with open(path, 'rb') as f:
self.venv = pickle.load(f)
self.venv.check_version()
if not self.graph.is_equal_venv(self.venv.graph,self.config['target_policy_name']):
logger.error('Detected a different graph between the loaded venv and the data config; this is most likely caused by a change of the config file.')
logger.error('Please check whether the config files used for learning the Environment and the Policy differ!')
sys.exit()
if not self.graph.is_equal_structure(self.venv.graph):
logger.warning('graph.is_equal_structure: detected a different graph structure between the loaded venv and the data config; this is most likely caused by a change of the config file, trying to rebuild ...')
venv_list = []
for _venv in self.venv.env_list:
graph = deepcopy(self.graph)
graph.copy_graph_node(_venv.graph)
venv_list.append(VirtualEnvDev(graph))
if return_graph:
return graph
self.venv = VirtualEnv(venv_list)
ray.get(self.venv_data_buffer.set_best_venv.remote(self.venv))
except Exception as e:
logger.info(f"Don't load venv -> {e}")
self.venv = None
def _reload_policy(self, path : str):
r'''Reload the policy from the given path.'''
try:
with open(path, 'rb') as f:
self.policy = pickle.load(f)
self.policy.check_version()
ray.get(self.policy_data_buffer.set_best_policy.remote(self.policy))
except Exception as e:
logger.info(f"Don't load policy -> {e}")
self.policy = None
def train(self, env_save_path : Optional[str] = None):
r"""
Train the virtual environment and the policy.
Steps:
1. Load the data and the configuration, then start a ray actor to train the virtual environment;
2. Load the data, the configuration and the trained virtual environment, then start a ray actor to train the policy.
(A short usage sketch is given in the comments after this method.)
"""
self.train_venv()
self.train_policy(env_save_path)
self.tune_parameter(env_save_path)
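# The three stages started by train() can also be launched individually
# (sketch; assumes `server` is an initialized ReviveServer):
#
#     server.train_venv()       # fit only the virtual environment
#     server.stop_train()       # later, signal all training tasks to stop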
def train_venv(self):
r"""
Load the data and the configuration, then start a ray actor to train the virtual environment.
"""
if self.env_save_path and os.path.exists(self.env_save_path):
graph = self._reload_venv(self.env_save_path, return_graph=True)
self.config['graph'] = graph
self.graph = graph
self.venv_logger = ray.remote(Logger).remote()
self.venv_logger.update.remote(key="task_state", value="Wait")
if self.venv_mode == 'None':
self.venv_logger.update.remote(key="task_state", value="End")
else:
if 'wdist' in self.config['venv_metric']:
self.config['max_distance'] = 2
self.config['min_distance'] = 0
elif 'mae' in self.config['venv_metric']:
self.config['max_distance'] = np.log(2)
self.config['min_distance'] = np.log(2) - 15
elif 'mse' in self.config['venv_metric']:
self.config['max_distance'] = np.log(4)
self.config['min_distance'] = np.log(4) - 15
elif 'nll' in self.config['venv_metric']:
self.config['max_distance'] = 0.5 * np.log(2 * np.pi)
self.config['min_distance'] = 0.5 * np.log(2 * np.pi) - 10
logger.info(f"Distance is between {self.config['min_distance']} and {self.config['max_distance']}")
if self.config["venv_algo"] == "revive":
self.config["venv_algo"] = "revive_p"
logger.remove()
if self.venv_mode == 'once':
venv_trainer = ray.remote(VenvTrain).remote(self.config, self.venv_logger, command=sys.argv[1:])
venv_trainer.train.remote()
# NOTE: after the task finishes, the actor is automatically killed by ray since there is no reference to it
elif self.venv_mode == 'tune':
self.venv_trainer = ray.remote(TuneVenvTrain).remote(self.config, self.venv_logger, command=sys.argv[1:])
self.venv_trainer.train.remote()
logger.add(self.revive_log_path)
def train_policy(self, env_save_path : Optional[str] = None):
r"""
Load the data, the configuration and the trained virtual environment, then start a ray actor to train the policy.
Parameters:
:env_save_path: Save path of the virtual environment. Defaults to None, in which case the virtual environment file is located automatically according to the run_id.
.. note:: A trained virtual environment model and a reward function must be provided before training the policy.
(A short usage sketch is given in the comments after this method.)
"""
if not env_save_path:
env_save_path = os.path.join(self.workspace, 'env.pkl')
self._reload_venv(env_save_path)
if self.venv is None:
logger.warning(f"Can't load the exist env model.")
self.policy_logger = ray.remote(Logger).remote()
self.policy_logger.update.remote(key="task_state", value="Wait")
logger.remove()
if self.policy_mode == 'None':
self.policy_logger.update.remote(key="task_state", value="End")
elif self.policy_mode == 'once':
assert self.reward_func is not None, 'policy training needs a reward function'
policy_trainer = ray.remote(PolicyTrain).remote(self.config, self.policy_logger, self.venv_logger, command=sys.argv[1:])
policy_trainer.train.remote()
# NOTE: after the task finishes, the actor is automatically killed by ray since there is no reference to it
elif self.policy_mode == 'tune':
assert self.reward_func is not None, 'policy training needs a reward function'
self.policy_trainer = ray.remote(TunePolicyTrain).remote(self.config, self.policy_logger, self.venv_logger, command=sys.argv[1:])
self.policy_trainer.train.remote()
logger.add(self.revive_log_path)
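# Sketch of reusing a previously trained virtual environment (the path below is
# a placeholder; by default train_policy looks for '<workspace>/env.pkl'):
#
#     server.train_policy(env_save_path='logs/<run_id>/env.pkl')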
def tune_parameter(self, env_save_path : Optional[str] = None):
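r"""Search for the best values of the tunable parameters: load the trained virtual environment (optionally from ``env_save_path``) and start a ray actor running the parameter tuner."""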
if env_save_path is not None:
self._reload_venv(env_save_path)
self.config['user_func'] = self.reward_func
self.tuner_logger = ray.remote(Logger).remote()
self.tuner_logger.update.remote(key="task_state", value="Wait")
if self.tuning_mode == 'None':
self.tuner_logger.update.remote(key="task_state", value="End")
else:
assert self.reward_func is not None, 'parameter tuning needs a reward function'
self.tuner = ray.remote(ParameterTuner).remote(self.config, self.tuning_mode, self.tune_initial_state, self.tuner_logger, self.venv_logger)
self.tuner.run.remote()
def stop_train(self) -> None:
r"""Stop all training tasks.
"""
_data = {"REVIVE_STOP" : True}
with open(os.path.join(self.workspace, ".env.json"), 'w') as f:
json.dump(_data, f)
if self.venv_logger is not None:
venv_logger = self.venv_logger.get_log.remote()
venv_logger = ray.get(venv_logger)
if venv_logger["task_state"] != "End":
self.venv_logger.update.remote(key="task_state", value="Shutdown")
if self.policy_logger is not None:
policy_logger = self.policy_logger.get_log.remote()
policy_logger = ray.get(policy_logger)
if policy_logger["task_state"] != "End":
self.policy_logger.update.remote(key="task_state", value="Shutdown")
def get_virtualenv_env(self) -> Tuple[VirtualEnv, Dict[str, Union[str, float]], Dict[int, Tuple[str, str]], str]:
r"""Get the current best virtual environment model and the training log (see the usage sketch in the comments after this method).
"""
assert self.dataset is not None
train_log = {}
if self.venv_logger is not None:
try:
venv_logger = self.venv_logger.get_log.remote()
venv_logger = ray.get(venv_logger)
train_log.update({"task_state": venv_logger["task_state"],})
except AttributeError:
train_log.update({"task_state": "Shutdown"})
metric = ray.get(self.venv_data_buffer.get_dict.remote())
venv_acc = float(metric["max_acc"])
current_num_of_trials = int(metric["num_of_trial"])
total_num_of_trials = int(metric["total_num_of_trials"])
train_log.update({
"venv_acc" : venv_acc,
"current_num_of_trials" : current_num_of_trials,
"total_num_of_trials" : total_num_of_trials,
})
self.venv_acc = max(self.venv_acc, venv_acc)
self.venv = ray.get(self.venv_data_buffer.get_best_venv.remote())
best_model_workspace = ray.get(self.venv_data_buffer.get_best_model_workspace.remote())
if self.venv is not None:
with open(os.path.join(self.workspace, 'env.pkl'), 'wb') as f:
pickle.dump(self.venv, f)
try:
self.venv.export2onnx(os.path.join(self.workspace, 'env.onnx'), verbose=False)
except Exception as e:
logger.info(f"Can't export venv to ONNX. -> {e}")
status_message = ray.get(self.venv_data_buffer.get_status.remote())
return self.venv, train_log, status_message, best_model_workspace
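# Polling sketch (illustrative; assumes venv training has been started):
#
#     venv, train_log, status, best_dir = server.get_virtualenv_env()
#     print(train_log.get('task_state'), train_log.get('venv_acc'))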
def get_policy_model(self) -> Tuple[PolicyModel, Dict[str, Union[str, float]], Dict[int, Tuple[str, str]], str]:
r"""Get the current best policy model and the training log.
"""
assert self.dataset is not None
train_log = {}
if self.policy_logger is not None:
try:
policy_logger = self.policy_logger.get_log.remote()
policy_logger = ray.get(policy_logger)
train_log.update({"task_state": policy_logger["task_state"],})
except AttributeError:
train_log.update({"task_state": "Shutdown"})
metric = ray.get(self.policy_data_buffer.get_dict.remote())
policy_acc = float(metric["max_reward"])
current_num_of_trials = int(metric["num_of_trial"])
total_num_of_trials = int(metric["total_num_of_trials"])
train_log.update({
"policy_acc" : policy_acc,
"current_num_of_trials" : current_num_of_trials,
"total_num_of_trials" : total_num_of_trials,
})
self.policy_acc = max(self.policy_acc, policy_acc)
self.policy = ray.get(self.policy_data_buffer.get_best_policy.remote())
best_model_workspace = ray.get(self.policy_data_buffer.get_best_model_workspace.remote())
if self.policy is not None:
with open(os.path.join(self.workspace, 'policy.pkl'), 'wb') as f:
pickle.dump(self.policy, f)
try:
tmp_policy = deepcopy(self.policy)
tmp_policy.reset()
tmp_policy.export2onnx(os.path.join(self.workspace, 'policy.onnx'), verbose=False)
except Exception as e:
logger.info(f"Can't to export venv to ONNX. -> {e}")
status_message = ray.get(self.policy_data_buffer.get_status.remote())
return self.policy, train_log, status_message, best_model_workspace
def get_parameter(self) -> Tuple[np.ndarray, Dict[str, Union[str, float]]]:
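r"""Get the current best tuned parameters and the training log."""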
train_log = {}
if self.tuner_logger is not None:
try:
tuner_logger = self.tuner_logger.get_log.remote()
tuner_logger = ray.get(tuner_logger)
train_log.update({"task_state": tuner_logger["task_state"],})
except AttributeError:
train_log.update({"task_state": "Shutdown"})
metric = ray.get(self.tuner_data_buffer.get_state.remote())
train_log.update(metric)
self.best_parameter = train_log.pop('best_parameter')
return self.best_parameter, train_log
def _check_license(self):
from revive.utils.auth_utils import check_license
check_license(self)