diff --git a/tutorials/CMakeLists.txt b/tutorials/CMakeLists.txt index 93f2ab646eb8d..5fb0075afbd98 100644 --- a/tutorials/CMakeLists.txt +++ b/tutorials/CMakeLists.txt @@ -76,6 +76,7 @@ if(MSVC AND NOT win_broken_tests) list(APPEND dataframe_veto machine_learning/ml_dataloader_TensorFlow.py) list(APPEND dataframe_veto machine_learning/ml_dataloader_PyTorch.py) list(APPEND dataframe_veto machine_learning/ml_dataloader_filters_vectors.py) + list(APPEND dataframe_veto machine_learning/ml_dataloader_resampling.py) # df036* and df037* seem to trigger OS errors when trying to delete the # test files created in the tutorials. It is unclear why. list(APPEND dataframe_veto analysis/dataframe/df036_missingBranches.C) @@ -128,6 +129,7 @@ if (NOT dataframe) list(APPEND dataframe_veto machine_learning/ml_dataloader_TensorFlow.py) list(APPEND dataframe_veto machine_learning/ml_dataloader_PyTorch.py) list(APPEND dataframe_veto machine_learning/ml_dataloader_filters_vectors.py) + list(APPEND dataframe_veto machine_learning/ml_dataloader_resampling.py) # RooFit tutorials depending on RDataFrame list(APPEND dataframe_veto roofit/roofit/rf408_RDataFrameToRooFit.C @@ -937,6 +939,7 @@ if(pyroot) file(GLOB requires_torch RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} machine_learning/pytorch/*.py machine_learning/ml_dataloader_PyTorch.py + machine_learning/ml_dataloader_resampling.py ) file(GLOB requires_xgboost RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} machine_learning/tmva101_Training.py diff --git a/tutorials/machine_learning/index.md b/tutorials/machine_learning/index.md index 0a39cc3346db2..c069e983ada23 100644 --- a/tutorials/machine_learning/index.md +++ b/tutorials/machine_learning/index.md @@ -137,4 +137,5 @@ | ml_dataloader_NumPy.py | Loading batches of events from a ROOT dataset as Python generators of numpy arrays. | | ml_dataloader_PyTorch.py | Loading batches of events from a ROOT dataset into a basic PyTorch workflow. 
| | ml_dataloader_TensorFlow.py | Loading batches of events from a ROOT dataset into a basic TensorFlow workflow. | +| ml_dataloader_resampling.py | Loading batches of events from an imbalanced ROOT dataset and balancing them. | diff --git a/tutorials/machine_learning/ml_dataloader_resampling.py b/tutorials/machine_learning/ml_dataloader_resampling.py new file mode 100644 index 0000000000000..91c25c17ed621 --- /dev/null +++ b/tutorials/machine_learning/ml_dataloader_resampling.py @@ -0,0 +1,89 @@ +### \file +### \ingroup tutorial_ml +### \notebook -nodraw +### Example of resampling when one class is underrepresented in the dataset. +### +### \macro_code +### \macro_output +### \author Jonah Ascoli + +import ROOT +import torch + +seed = 42 +torch.manual_seed(seed) + + +# Create an imbalanced dataset with two classes, one of which is underrepresented. +# Here, we'll create two RDataFrames, one with even numbers and one with odd numbers, +# and pass both to the data loader to form a dataset with underrepresented odd numbers. 
+def make_df(b1_expr, num_events):
+    """Build an RDataFrame with `num_events` entries and two defined columns.
+
+    b1_expr: expression string handed to RDataFrame's Define() for column "b1".
+    num_events: number of entries of the (source-less) RDataFrame.
+    Returns the dataframe with "b1" and "b2" defined, where "b2" is b1 % 2
+    cast to int (the class label used as training target below).
+    """
+    df = ROOT.RDataFrame(num_events)
+    return df.Define("b1", b1_expr).Define("b2", "(int) b1%2")
+
+
+# Majority class: 100000 entries from the "2 * entry" expression (even numbers).
+# Minority class: 1000 entries from the "2 * entry + 1" expression (odd numbers).
+df_major = make_df("(int) 2 * rdfentry_", 100000)
+df_minor = make_df("(int) 2 * rdfentry_ + 1", 1000)
+
+batch_size = 256
+num_epochs = 20
+
+# BCEWithLogitsLoss applies the sigmoid internally, so the model must output raw logits.
+loss_fn = torch.nn.BCEWithLogitsLoss()
+
+
+def _truncate(value, digits=5):
+    """Cut `value` off after `digits` decimal places (truncation, not rounding)."""
+    scale = 10**digits
+    return int(value * scale) / scale
+
+
+def _count_correct(logits, y):
+    """Return how many entries of `logits` fall on the same side of the decision boundary as `y`.
+
+    The model emits logits, and sigmoid(z) > 0.5 is equivalent to z > 0, so the
+    class boundary is at 0 (thresholding the logits at 0.5 would bias the
+    predictions towards class 0).
+    """
+    preds = (logits > 0).float()
+    return (preds == y).sum().item()
+
+
+# Function to train the model and print useful loss statistics
+def train_model(model, optimizer, dataloader):
+    """Train `model` for `num_epochs` epochs, printing accuracy and mean loss per epoch.
+
+    model: torch module producing one logit per input row.
+    optimizer: torch optimizer bound to the parameters of `model`.
+    dataloader: object providing train_test_split(test_size=...), whose two
+        results provide as_torch() yielding (X, y) batches.
+
+    The data is split 80/20 into training and validation parts. After each
+    epoch one "Training => ..." and one "Validation => ..." line is printed,
+    with values truncated to 5 decimal places. Returns None.
+    """
+    train, val = dataloader.train_test_split(test_size=0.2)
+    for _ in range(num_epochs):
+        train_correct = 0
+        train_total = 0
+        train_losses = []
+        model.train()
+        for X, y in train.as_torch():
+            optimizer.zero_grad()
+            outputs = model(X)
+            loss = loss_fn(outputs, y)
+            loss.backward()
+            optimizer.step()
+
+            train_correct += _count_correct(outputs, y)
+            train_total += y.size(0)
+            train_losses.append(loss.item())
+        print(
+            f"Training => Accuracy: {_truncate(train_correct / train_total)}; Loss: {_truncate(sum(train_losses) / len(train_losses))}"
+        )
+
+        val_losses = []
+        val_correct = 0
+        val_total = 0
+        # Evaluation mode for validation; model.train() re-enables training mode next epoch.
+        model.eval()
+        with torch.no_grad():
+            for X, y in val.as_torch():
+                outputs = model(X)
+                loss = loss_fn(outputs, y)
+
+                val_correct += _count_correct(outputs, y)
+                val_total += y.size(0)
+                val_losses.append(loss.item())
+
+        print(
+            f"Validation => Accuracy: {_truncate(val_correct / val_total)}; Loss: {_truncate(sum(val_losses) / len(val_losses))}\n"
+        )
+
+
+# Oversampling strategy: more batches of the underrepresented class
+# Takes more time per epoch, but each epoch is more balanced & effective
+dl_oversampled = ROOT.Experimental.ML.RDataLoader(
+    [df_major, df_minor],
+    batch_size=batch_size,
+    target="b2",
+    set_seed=seed,
+    load_eager=True,  # Must be enabled for resampling
+    sampling_type="oversampling",  # Can also be "undersampling"
+    sampling_ratio=0.1,  # ~10% of the data will be from the underrepresented class, instead of ~1%
+)
+
+# One input feature in, one logit out: a minimal linear classifier for the binary target.
+oversampling_model = torch.nn.Linear(in_features=1, out_features=1)
+# Adam with its default hyperparameters, optimizing the linear layer's parameters.
+oversampling_optimizer = torch.optim.Adam(params=oversampling_model.parameters())
+
+print("Training with oversampling:")
+train_model(
+    model=oversampling_model,
+    optimizer=oversampling_optimizer,
+    dataloader=dl_oversampled,
+)