Logistic regression: accuracy of 1.0 (is there an error in my code?)
Description
Hello. I'm trying to build a simple logistic regression model with DJL, and I'm getting an accuracy of 1.0.
Expected Behavior
I expected an accuracy of around 0.92 (as with PyTorch).
Below you will find my Java code:
package centralized;
import java.io.IOException;
import tech.tablesaw.api.Table;
import ai.djl.Model;
import ai.djl.ndarray.NDManager;
import ai.djl.ndarray.NDArray;
import ai.djl.nn.SequentialBlock;
import ai.djl.nn.core.Linear;
import ai.djl.training.dataset.ArrayDataset;
import ai.djl.training.dataset.RandomAccessDataset;
import ai.djl.training.loss.Loss;
import ai.djl.training.loss.SigmoidBinaryCrossEntropyLoss;
import ai.djl.training.tracker.Tracker;
import ai.djl.training.optimizer.Optimizer;
import ai.djl.training.TrainingConfig;
import ai.djl.training.DefaultTrainingConfig;
import ai.djl.training.Trainer;
import ai.djl.training.evaluator.Accuracy;
import ai.djl.training.TrainingResult;
import ai.djl.training.listener.TrainingListener;
import ai.djl.metric.Metrics;
import ai.djl.training.EasyTrain;
import ai.djl.ndarray.types.Shape;
import ai.djl.translate.TranslateException;
public class Centralized {

    public static ArrayDataset loadArray(NDArray features, NDArray labels, int batchSize, boolean shuffle) {
        return new ArrayDataset.Builder()
                .setData(features) // set the features
                .optLabels(labels) // set the labels
                .setSampling(batchSize, shuffle) // set the batch size and random sampling
                .build();
    }

    public static void main(String[] args) throws IOException, TranslateException {
        Table spambase = Table.read().csv("spambase.csv");
        Table inputs = spambase.copy().removeColumns("is_spam");
        Table outputs = spambase.copy().retainColumns("is_spam");
        NDManager manager = NDManager.newBaseManager();
        NDArray x = manager.create(inputs.as().floatMatrix());
        NDArray y = manager.create(outputs.as().intMatrix());
        int batchSize = 10;
        ArrayDataset dataset = loadArray(x, y, batchSize, true);
        RandomAccessDataset[] datasets_split = dataset.randomSplit(80, 20);
        RandomAccessDataset trainingSet = datasets_split[0];
        RandomAccessDataset validationSet = datasets_split[1];
        Model model = Model.newInstance("logistic-regression");
        SequentialBlock net = new SequentialBlock();
        Linear linearBlock = Linear.builder().optBias(true).setUnits(1).build();
        net.add(linearBlock);
        model.setBlock(net);
        Loss loss = new SigmoidBinaryCrossEntropyLoss();
        Tracker lrt = Tracker.fixed(0.1f);
        Optimizer sgd = Optimizer.sgd().setLearningRateTracker(lrt).build();
        TrainingConfig config = new DefaultTrainingConfig(loss)
                .optOptimizer(sgd) // Optimizer
                .optDevices(manager.getEngine().getDevices(0)) // CPU
                .addEvaluator(new Accuracy()) // Model Accuracy
                .addTrainingListeners(TrainingListener.Defaults.logging()); // Logging
        Trainer trainer = model.newTrainer(config);
        trainer.initialize(new Shape(1, inputs.columnCount()));
        Metrics metrics = new Metrics();
        trainer.setMetrics(metrics);
        int numEpochs = 1;
        EasyTrain.fit(trainer, numEpochs, trainingSet, validationSet);
        // System.out.println(metrics.getMetricNames());
        double accuracy = metrics.mean("validate_epoch_Accuracy");
        System.out.println(accuracy);
        TrainingResult result = trainer.getTrainingResult();
        System.out.println(result);
    }
}
This is the output:
> Task :app:runCentralized
SLF4J(I): Connected with provider of type [org.slf4j.jul.JULServiceProvider]
avr. 24, 2024 4:19:39 AM ai.djl.pytorch.engine.PtEngine newInstance
INFOS: PyTorch graph executor optimizer is enabled, this may impact your inference latency and throughput. See: https://docs.djl.ai/docs/development/inference_performance_optimization.html#graph-executor-optimization
avr. 24, 2024 4:19:39 AM ai.djl.pytorch.engine.PtEngine newInstance
INFOS: Number of inter-op threads is 8
avr. 24, 2024 4:19:39 AM ai.djl.pytorch.engine.PtEngine newInstance
INFOS: Number of intra-op threads is 12
avr. 24, 2024 4:19:39 AM ai.djl.training.listener.LoggingTrainingListener onTrainingBegin
INFOS: Training on: cpu().
avr. 24, 2024 4:19:39 AM ai.djl.training.listener.LoggingTrainingListener onTrainingBegin
INFOS: Load PyTorch Engine Version 2.1.1 in 0,031 ms.
Training: 100% |████████████████████████████████████████| Accuracy: 1,00, SigmoidBinaryCrossEntropyLoss: 1155,97
Validating: 100% |████████████████████████████████████████|
avr. 24, 2024 4:19:39 AM ai.djl.training.listener.LoggingTrainingListener onEpoch
INFOS: Epoch 1 finished.
avr. 24, 2024 4:19:39 AM ai.djl.training.listener.LoggingTrainingListener onEpoch
INFOS: Train: Accuracy: 1,00, SigmoidBinaryCrossEntropyLoss: 1148,56
avr. 24, 2024 4:19:39 AM ai.djl.training.listener.LoggingTrainingListener onEpoch
INFOS: Validate: Accuracy: 1,00, SigmoidBinaryCrossEntropyLoss: 1097,44
1.0
{
"epoch": 1,
"evaluations": {
"validate_Accuracy": 1.0,
"train_SigmoidBinaryCrossEntropyLoss": 1148.5554,
"validate_SigmoidBinaryCrossEntropyLoss": 1097.441,
"train_Accuracy": 1.0,
"train_loss": 1148.5554,
"validate_loss": 1097.441
}
}
I created a similar model in PyTorch:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
df = pd.read_csv("spambase.csv")
data = df.to_numpy()
y = LabelEncoder().fit_transform(data[:, 57])
X = np.delete(data, [57], axis=1).astype('float64')
X = StandardScaler().fit_transform(X)
X = torch.tensor(X).float()
y = torch.tensor(y).float().reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True)
loader = DataLoader(list(zip(X_train, y_train)), shuffle=True, batch_size=10)
n_inputs = 57
n_outputs = 1
model = nn.Sequential(
    nn.Linear(n_inputs, n_outputs),
    nn.Sigmoid()
)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = nn.BCELoss()
n_epochs = 1
model.train()
for epoch in range(n_epochs):
    for X_batch, y_batch in loader:
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
# evaluate accuracy after training
model.eval()
y_pred = model(X_test)
acc = (y_pred.round() == y_test).float().mean()
acc = float(acc)
print("Model accuracy: %.2f" % acc)
And the output:
Model accuracy: 0.92
Is there an error in my Java/DJL code? I tried to follow the "Dive into Deep Learning" book and the documentation, but I'm not sure about my code. Are my Java and Python programs equivalent?
PS: The dataset is from https://archive.ics.uci.edu/dataset/94/spambase, and I converted it to a CSV.
Thanks
For information, these are the model parameters before training (if you know how to print all the values instead of "... 37 more", please tell me; one possible workaround is sketched after the dumps):
weight: (1, 57) cpu() float32 hasGradient
[[-0.0309, -0.0797, 0.3092, -0.1011, 0.2231, 0.118 , 0.1488, 0.0543, -0.5098, -0.4196, -0.1896, 0.0717, 0.0586, -0.0586, 0.0395, 0.0914, 0.3283, -0.1463, -0.3702, 0.1664, ... 37 more],
]
bias: (1) cpu() float32 hasGradient
[0.]
And this is after training:
weight: (1, 57) cpu() float32 hasGradient
[[ -0.3021, -3.2209, -0.8564, 0.5633, 0.0698, 0.1584, 1.1403, 0.4022, -0.3137, -1.5199, -0.0627, -5.3316, -0.1716, -0.3107, 0.3686, 1.119 , 0.8939, -0.0675, -8.1434, 0.9069, ... 37 more],
]
bias: (1) cpu() float32 hasGradient
[-9.5678]
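A possible workaround (untested sketch): pull the raw values out of each parameter's NDArray with toFloatArray() and print the resulting array directly instead of relying on the truncated toString() view. This assumes the block's parameters can be iterated as name/Parameter pairs (PairList style) and needs java.util.Arrays, ai.djl.util.Pair and ai.djl.nn.Parameter imports:

for (Pair<String, Parameter> param : model.getBlock().getParameters()) {
    // Copy the full parameter values out of the NDArray and print them all
    NDArray array = param.getValue().getArray();
    System.out.println(param.getKey() + " " + array.getShape() + ": "
            + Arrays.toString(array.toFloatArray()));
}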
I tried to use a Predictor to check the predicted values, with this code:
Translator<NDList, NDList> translator = new NoopTranslator();
Predictor<NDList, NDList> predictor = model.newPredictor(translator);
for (Batch b : validationSet.getData(manager)) {
    NDList data = b.getData();
    NDArray prediction = predictor.predict(data).singletonOrThrow();
    NDArray truelabel = b.getLabels().singletonOrThrow();
    System.out.println("Predicted is: " + prediction.toString());
    System.out.println("True value is: " + truelabel.toString());
}
And this is the result (one batch, but it is similar for all batches):
Predicted is: ND: (10, 1) cpu() float32
[[-10483.418 ],
[ -1452.9528],
[ -7738.9502],
[ -709.6656],
[ -1241.8141],
[ -3426.0046],
[ -188.6846],
[ -580.9608],
[ -98.5921],
[ -1608.1211],
]
True value is: ND: (10, 1) cpu() int32
[[ 1],
[ 0],
[ 1],
[ 0],
[ 0],
[ 1],
[ 1],
[ 0],
[ 0],
[ 1],
]
Then I tried to apply a sigmoid operation to the predicted values, like this:
for (Batch b : validationSet.getData(manager)) {
    NDList data = b.getData();
    NDArray prediction = predictor.predict(data).singletonOrThrow();
    NDArray prediction_binary = Activation.sigmoid(prediction);
    NDArray truelabel = b.getLabels().singletonOrThrow();
    System.out.println("Predicted is: " + prediction_binary.toString());
    System.out.println("True value is: " + truelabel.toString());
}
and the results look like this:
Predicted is: ND: (10, 1) cpu() float32
[[1. ],
[1. ],
[1. ],
[1. ],
[1. ],
[1. ],
[1. ],
[0.0012],
[1. ],
[1. ],
]
True value is: ND: (10, 1) cpu() int32
[[ 0],
[ 0],
[ 0],
[ 1],
[ 0],
[ 0],
[ 0],
[ 0],
[ 1],
[ 1],
]
Any idea where the problem is?
Hey @MohamedLEGH, I have a question: is NoopTranslator() only used for classification, or can it also be used for regression?
Also, I think this is because of the weights; try applying standard scaling or min-max scaling to the input data.
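For example, a rough column-wise min-max scaling sketch with the DJL NDArray API (untested; x is the feature NDArray from the code above, and axis 0 is assumed to be the row axis):

NDArray min = x.min(new int[] {0});              // per-column minimum
NDArray max = x.max(new int[] {0});              // per-column maximum
NDArray scaledX = x.sub(min).div(max.sub(min));  // rescale each feature to [0, 1]
// Note: a constant column makes max - min equal to 0, which divides by zero.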
OK, I have updated the code with the following modifications:
- The data are normalized (subtract the mean and divide by the standard deviation)
- BinaryAccuracy is used instead of Accuracy
Below is the updated code:
package machine_learning;
import java.io.IOException;
import java.util.Random;
import tech.tablesaw.api.Table;
import ai.djl.Model;
import ai.djl.ndarray.NDManager;
import ai.djl.ndarray.NDArray;
import ai.djl.ndarray.NDList;
import ai.djl.ndarray.types.DataType;
import ai.djl.nn.SequentialBlock;
import ai.djl.nn.core.Linear;
import ai.djl.nn.Parameter;
import ai.djl.nn.Activation;
import ai.djl.training.dataset.ArrayDataset;
import ai.djl.training.dataset.RandomAccessDataset;
import ai.djl.training.loss.Loss;
import ai.djl.training.loss.SigmoidBinaryCrossEntropyLoss;
import ai.djl.training.tracker.Tracker;
import ai.djl.training.optimizer.Optimizer;
import ai.djl.training.TrainingConfig;
import ai.djl.training.DefaultTrainingConfig;
import ai.djl.training.Trainer;
import ai.djl.training.evaluator.Accuracy;
import ai.djl.training.evaluator.BinaryAccuracy;
import ai.djl.training.TrainingResult;
import ai.djl.training.listener.TrainingListener;
import ai.djl.training.dataset.Batch;
import ai.djl.training.EasyTrain;
import ai.djl.training.initializer.ConstantInitializer;
import ai.djl.metric.Metrics;
import ai.djl.ndarray.types.Shape;
import ai.djl.translate.TranslateException;
import ai.djl.translate.Translator;
import ai.djl.translate.NoopTranslator;
import ai.djl.inference.Predictor;
import ai.djl.util.Pair;
public class LogisticRegression {

    public static void main(String[] args) throws IOException, TranslateException {
        Table spambase = Table.read().csv("spambase.csv");
        Table inputs = spambase.copy().removeColumns("is_spam");
        Table outputs = spambase.copy().retainColumns("is_spam");
        NDManager manager = NDManager.newBaseManager();
        NDArray x = manager.create(inputs.as().floatMatrix());
        NDArray scaled_x = Utils.normalize(x);
        NDArray y = manager.create(outputs.as().intMatrix());
        int batchSize = inputs.rowCount();
        ArrayDataset dataset = Utils.loadArray(scaled_x, y, batchSize, true);
        RandomAccessDataset[] datasets_split = dataset.randomSplit(80, 20);
        ArrayDataset trainingSet = (ArrayDataset) datasets_split[0];
        ArrayDataset testingSet = (ArrayDataset) datasets_split[1];
        Model model = Model.newInstance("logistic");
        SequentialBlock net = new SequentialBlock();
        Linear linearBlock = Linear.builder().optBias(true).setUnits(1).build();
        net.add(linearBlock);
        // net.setInitializer(new ConstantInitializer(0), Parameter.Type.WEIGHT);
        // net.initialize(manager, DataType.FLOAT32, x.getShape());
        model.setBlock(net);
        Loss loss = new SigmoidBinaryCrossEntropyLoss();
        float lr = 0.01f;
        Tracker lrt = Tracker.fixed(lr);
        Optimizer sgd = Optimizer.sgd().setLearningRateTracker(lrt).build();
        TrainingConfig config = new DefaultTrainingConfig(loss)
                .optOptimizer(sgd) // Optimizer
                .optDevices(manager.getEngine().getDevices(0)) // CPU
                .addEvaluator(new BinaryAccuracy()) // Model Accuracy
                .addTrainingListeners(TrainingListener.Defaults.logging()); // Logging
        Trainer trainer = model.newTrainer(config);
        trainer.initialize(new Shape(1, inputs.columnCount()));
        Metrics metrics = new Metrics();
        trainer.setMetrics(metrics);
        int numEpochs = 1000; // only 10 with initialization of weights to 0
        EasyTrain.fit(trainer, numEpochs, trainingSet, testingSet);
    }
}
And the results are now close to 90% accuracy.
mai 14, 2024 5:17:01 PM ai.djl.training.listener.LoggingTrainingListener onEpoch
INFOS: Train: BinaryAccuracy: 0,91, SigmoidBinaryCrossEntropyLoss: 0,29
mai 14, 2024 5:17:01 PM ai.djl.training.listener.LoggingTrainingListener onEpoch
INFOS: Validate: BinaryAccuracy: 0,89, SigmoidBinaryCrossEntropyLoss: 0,31
If the weights are initialized to 0, convergence is much faster. I have solved my issue, but I think the Accuracy() metric should handle the binary case instead of requiring a separate BinaryAccuracy() metric.
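For reference, the zero initialization corresponds to the two commented-out lines in the code above; a minimal sketch (the shape passed to initialize() mirrors the commented-out line and is an assumption on my side, since the trainer would otherwise initialize the block itself):

SequentialBlock net = new SequentialBlock();
net.add(Linear.builder().optBias(true).setUnits(1).build());
// Start the linear weights at 0 instead of the default random initialization
net.setInitializer(new ConstantInitializer(0), Parameter.Type.WEIGHT);
net.initialize(manager, DataType.FLOAT32, x.getShape());
model.setBlock(net);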
To answer the earlier question about NoopTranslator(): I suppose you can also use it for regression. I don't know, as I'm only doing classification right now, but I suppose it should work.
Yeah it worked.
Hey @MohamedLEGH, so you are using Utils.normalize; is Utils your own custom class, and how are you performing the normalization? Can you share the code snippet? I am trying to normalize data for a classification task but I am not getting proper output.
My Utils.java code is below; I hope it helps:
package machine_learning;
import ai.djl.training.dataset.Record;
import ai.djl.ndarray.NDArray;
import ai.djl.ndarray.NDManager;
import ai.djl.training.dataset.ArrayDataset;
public class Utils {

    public static ArrayDataset loadArray(NDArray features, NDArray labels, int batchSize, boolean shuffle) {
        return new ArrayDataset.Builder()
                .setData(features) // set the features
                .optLabels(labels) // set the labels
                .setSampling(batchSize, shuffle) // set the batch size and random sampling
                .build();
    }

    public static NDArray mean(NDArray X) {
        return X.mean(new int[]{0});
    }

    public static NDArray std(NDArray X) {
        NDArray mean = mean(X);
        NDArray squaredDiff = X.sub(mean).pow(2);
        NDArray variance = squaredDiff.mean(new int[]{0});
        return variance.sqrt();
    }

    public static NDArray normalize(NDArray X) {
        return X.sub(mean(X)).div(std(X));
    }
}
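One caveat with normalize(): a constant column has a standard deviation of 0, so the division produces NaN. A possible guard (untested sketch, clamping the denominator with NDArray.maximum):

public static NDArray normalizeSafe(NDArray X) {
    // Keep the per-column standard deviation away from 0 so constant
    // columns do not become NaN after the division.
    NDArray std = std(X).maximum(1e-7f);
    return X.sub(mean(X)).div(std);
}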