🏇Trainer & Predictor

Overview

With Ray Remote Task and Jobs, we can easily access the distributed computing capabilities of Ray framework to boost our workloads. When performing model training, Ray native trainers are optimized even further. Thus, the Fiat Copilot trainer module provides utilities to help users define Ray Native trainers.

XGBoost Utils

from ray.air import Result
from ray.data import Preprocessor

def define_xgb_trainer_and_fit(
        dataset: dict,        # {"train": ..., "test": ...}
        label_col: str,
        scaling_conf: dict,
        boost_round: int,
        training_params: dict,
        preprocess_pipeline: Preprocessor,
) -> Result

You can use the util function define_xgb_trainer_and_fit to get a Ray trained Result object. It has the following attributes -

# metrics dict
result.metrics

# metrics Pandas dataframe
result.metrics_dataframe

# Ray AIR checkpoint
result.checkpoint

You can also directly convert a pre-trained Booster model into XGBoostCheckpoint -

def convert_xgboost_model_into_checkpoint(
        xgb_model=None,
        path: str | None = None # Path to load
) -> XGBoostCheckpoint

With checkpoints, you can easily define your Predictor object -

def compose_xgboost_batch_predictor(
        checkpoint: XGBoostCheckpoint
) -> BatchPredictor

Here is a complete example -

XGBoost Example

import json

import pandas as pd
import ray
import xgboost as xgb
from ray.data import Dataset, DatasetPipeline
from ray.data.preprocessors import StandardScaler
from rich import print
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from fiat_copilot.trainer.xgboost_util import define_xgb_trainer_and_fit, \
    convert_xgboost_model_into_checkpoint, compose_xgboost_batch_predictor
from fiat_copilot.workflow.annotations import as_asset, form_definitions
from fiat_copilot.workflow.storage import get_io_manager, PersistStorage, DataFormat


ray.init(
    address="auto",
    runtime_env={
        "pip": [
            "xgboost",
            "xgboost_ray",
            "rich"
        ]
    }
)


@as_asset(
    name="load_breast_cancer_dataset",
    description="Loading breast cancer dataset."
)
def load_breast_cancer_dataset():
    print("Loading dataset...")
    dataset = ray.data.read_parquet(f"datasets/breast_cancer")
    train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)
    test_dataset = valid_dataset.drop_columns(["target"])

    return {
        "train": train_dataset.to_pandas().to_json(),
        "test": test_dataset.to_pandas().to_json()
    }


@as_asset(
    name="train_xgboost_model_with_ray",
    description="Form XGBoost trainer and train it."
)
def train_xgboost_model_with_ray(load_breast_cancer_dataset):
    print("Form XGBoost trainer and training")

    dataset = load_breast_cancer_dataset
    df = pd.read_json(dataset['train'])
    train_dataset = ray.data.from_pandas(df)

    result = define_xgb_trainer_and_fit(
        dataset={
            "train": train_dataset
        },
        label_col="target",
        scaling_conf={
            "num_workers": 2,
            "use_gpu": False
        },
        boost_round=20,
        training_params={
            "objective": "binary:logistic",
            "eval_metric": ["logloss", "error"],
        },
        preprocess_pipeline=StandardScaler(
            columns=["mean radius", "mean texture"]
        )
    )

    with open("datasets/xgboost_test/result.json", "w+", encoding='utf-8') as file_obj:
        json.dump(result.metrics, file_obj)

    return result.checkpoint


@as_asset(
    name="train_vanilla_xgboost_model",
    description="Train native XGBoost Classifier"
)
def train_vanilla_xgboost_model():
    # Dataloading from sklearn
    cancer = load_breast_cancer()
    X = cancer.data
    y = cancer.target

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.3,
        random_state=1898
    )
    model = xgb.XGBClassifier().fit(X_train, y_train)

    return model


@as_asset(
    name="form_predictor_with_checkpoint",
    description="Test different predictor.",
    io_manager_key="json_io_manager"
)
def form_predictor_with_checkpoint(
        load_breast_cancer_dataset,
        train_xgboost_model_with_ray,
        train_vanilla_xgboost_model
):
    dataset = load_breast_cancer_dataset
    df = pd.read_json(dataset['test'])
    test_dataset = ray.data.from_pandas(df)

    # Ray predictor
    ray_ckpt = train_xgboost_model_with_ray
    ray_xgb_predictor = compose_xgboost_batch_predictor(ray_ckpt)

    # Vanilla predictor
    vanilla_model = train_vanilla_xgboost_model
    vanilla_ckpt = convert_xgboost_model_into_checkpoint(xgb_model=vanilla_model)
    vanilla_xgb_predictor = compose_xgboost_batch_predictor(vanilla_ckpt)

    # Predicting
    print("Start predicting...")
    ray_predicted_prob: Dataset | DatasetPipeline = ray_xgb_predictor.predict(test_dataset)
    print("Done prediction! Results:")
    ray_predicted_prob.show()
    print("Finished")

    print("Start predicting...")
    vanilla_predicted_prob: Dataset | DatasetPipeline = vanilla_xgb_predictor.predict(test_dataset)
    print("Done prediction! Results:")
    vanilla_predicted_prob.show()
    print("Finished")

    # Form output
    result = {
        "ray_predicted_results": ray_predicted_prob.to_pandas().to_dict(),
        "vanilla_predicted_results": vanilla_predicted_prob.to_pandas().to_dict()
    }

    return result


defs = form_definitions(
    assets=[
        load_breast_cancer_dataset, train_xgboost_model_with_ray,
        train_vanilla_xgboost_model, form_predictor_with_checkpoint
    ],
    resources={
        "default_io_manager": get_io_manager(
            medium=PersistStorage.OSS,
            base_path="xgboost"
        ),
        "json_io_manager": get_io_manager(
            medium=PersistStorage.OSS,
            base_path="xgboost",
            data_format=DataFormat.JSON
        )
    }
)


ray.shutdown()

PyTorch Lightning Utils

PyTorch is one of the most popular Deep Learning framework in AI field. Fiat Copilot also provides some utilities of Torch Lightning module.

You can first define your lightning config by calling the build_lightning_config function -

def build_lightning_config(
        pl_module: Type[LightningModule],
        data_module: LightningDataModule,
        module_conf: dict,
        trainer_conf: dict,
        ckpt_conf: dict,
        trainer_logger: TensorBoardLogger | None = None
) -> dict[str, Any]t

Then, you can define your Ray Lightning Trainer by invoking the form_lightning_trainer function -

from ray.air import ScalingConfig, RunConfig, Checkpoint

def form_lightning_trainer(
        ptl_config: dict[str, any],
        scaling_conf: ScalingConfig,
        runing_conf: RunConfig,
        checkpoint: Checkpoint | None = None # Resume from a Ray AIR checkpoint or not
)

You can also easily define your Hyperparameter Tuner with the form_lightning_tuner function. With SyncConfig, we can syncing with external storage to persist our tuning experiments' results -

class SchedulerType(Enum):
    ASHA = AsyncHyperBandScheduler
    PBT = PopulationBasedTraining


def form_lightning_tuner(
        name: str,
        ptl_config: dict[str, any],
        num_samples: int,
        scheduler: TrialScheduler,
        scaling_conf: ScalingConfig,
        runing_conf: RunConfig,
        sync_conf: SyncConfig = SyncConfig(
            syncer=None  # By default, we disable syncing behaviors
        )
) -> Tuner

Here is a complete example -

PyTorch Lightning Example

from ray import tune
from ray.air import RunConfig, CheckpointConfig, ScalingConfig
from ray.tune.schedulers import ASHAScheduler

from fiat_copilot.trainer.torch_util import build_lightning_config, form_lightning_tuner
from tune_test.mnist_model import MNISTClassifier, MNISTDataModule
from fiat_copilot.workflow.annotations import as_asset, form_definitions
from fiat_copilot.workflow.ray_utils import long_run_job_client, JobDescription
from fiat_copilot.workflow.storage import get_io_manager, PersistStorage, DataFormat

# Configure Search Space
# The maximum training epochs
num_epochs = 5
# Number of sampls from parameter space
num_samples = 10
# Specify Accelerator
accelerator = "gpu"


@as_asset(
    name="form_lightning_config",
    description="define lightning config",
    io_manager_key="json_io_manager"
)
def form_lightning_config():
    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
    }

    trainer_conf = {
        "num_epochs": num_epochs,
        "accelerator": accelerator
    }

    checkpoint_conf = {
        "monitor": "ckpt_monitor",
        "save_top_k": 2,
        "mode": "max"
    }

    ltn_config = build_lightning_config(
        pl_module=MNISTClassifier,
        data_module=MNISTDataModule(batch_size=64),
        module_conf=config,
        trainer_conf=trainer_conf,
        ckpt_conf=checkpoint_conf,
    )

    return ltn_config


@as_asset(
    name="define_lightning_tuner",
    description="Form Ray lightning tuner"
)
def define_lightning_tuner(form_lightning_config):
    # specify ray worker scaling config
    scaling_conf = ScalingConfig(
        num_workers=3,
        use_gpu=True,
        resources_per_worker={
            "CPU": 4,
            "GPU": 1
        }
    )
    # define runtime config
    run_conf = RunConfig(
        checkpoint_config=CheckpointConfig(
            num_to_keep=2,
            checkpoint_score_attribute="ptl/val_accuracy",
            checkpoint_score_order="max",
        )
    )
    # use ASHA scheduler
    asha_scheduler = ASHAScheduler(
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2
    )
    # compose the tuner
    ltn_conf = form_lightning_config
    asha_tuner = form_lightning_tuner(
        name="tune_mnist_pbt",
        ptl_config=ltn_conf,
        num_samples=num_samples,
        scheduler=asha_scheduler,
        scaling_conf=scaling_conf,
        runing_conf=run_conf
    )

    return asha_tuner


@as_asset(
    name="tuning_with_ray",
    description="Fit tuner and get best results.",
    io_manager_key="json_io_manager"
)
def tuning_with_ray(define_lightning_tuner):
    cluster_url = "http://10.112.67.227:8265"
    job_description = JobDescription(
        entrypoint="python script.py",
        working_dir="examples/",
        pip_packages=[]
    )
    job_id = long_run_job_client(
        cluster=cluster_url,
        description=JobDescription(
            entrypoint="python script.py",
            working_dir="tune_test/",
            pip_packages=[]
        )
    )

    return {
        "job_id": job_id,
        "cluster": cluster_url,
        "job_descrption": job_description.dict()
    }


defs = form_definitions(
    assets=[form_lightning_config, define_lightning_tuner, tuning_with_ray],
    resources={
        "default_io_manager": get_io_manager(
            medium=PersistStorage.OSS,
            base_path="xgboost"
        ),
        "json_io_manager": get_io_manager(
            medium=PersistStorage.OSS,
            base_path="xgboost",
            data_format=DataFormat.JSON
        )
    }
)

PreviousWorkflow NextServing

Last updated 2 years ago

hashtagOverview

hashtagXGBoost Utils

hashtagPyTorch Lightning Utils

Overview

XGBoost Utils

PyTorch Lightning Utils