TransmogrifAI copied to clipboard
Two cache miss cases
Describe the bug
We have developed a tool named AutoCache
to automatically detect cache miss bugs in Spark applications. During evaluation, we found two cache miss bugs in TransmogrifAI.
Could you please consider adding caching for them?
code path:
To reproduce, use the following environment:
Spark 2.4.5
TransmogrifAI 0.7.0
Scala 2.11.12
run the following example code:
import com.salesforce.op._
import com.salesforce.op.evaluators.Evaluators
import com.salesforce.op.features.FeatureBuilder
import com.salesforce.op.features.types._
import com.salesforce.op.readers.DataReaders
import com.salesforce.op.stages.impl.regression.RegressionModelSelector
import com.salesforce.op.stages.impl.regression.RegressionModelsToTry._
import org.apache.spark.sql.SparkSession
/**
 * Define a case class representing the Boston housing data
 *
 * @param rowId   id of the house
 * @param crim    per capita crime rate by town
 * @param zn      proportion of residential land zoned for lots over 25,000 sq.ft.
 * @param indus   proportion of non-retail business acres per town
 * @param chas    Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
 * @param nox     nitric oxides concentration (parts per 10 million)
 * @param rm      average number of rooms per dwelling
 * @param age     proportion of owner-occupied units built prior to 1940
 * @param dis     weighted distances to five Boston employment centres
 * @param rad     index of accessibility to radial highways
 * @param tax     full-value property-tax rate per $10,000
 * @param ptratio pupil-teacher ratio by town
 * @param b       1000(Bk - 0.63)**2 where Bk is the proportion of blacks by town
 * @param lstat   % lower status of the population
 * @param medv    median value of owner-occupied homes in $1000's
 */
case class BostonHouse(
  rowId: Int,
  crim: Double,
  zn: Double,
  indus: Double,
  chas: String,
  nox: Double,
  rm: Double,
  age: Double,
  dis: Double,
  rad: Int,
  tax: Double,
  ptratio: Double,
  b: Double,
  lstat: Double,
  medv: Double
)
* A simplified TransmogrifAI example classification app using the Boston dataset
object OpBostonSimpleTest {
* Run this from the command line with
* ./gradlew sparkSubmit -Dmain=com.salesforce.hw.OpBostonSimple -Dargs=/full/path/to/csv/file
def main(args: Array[String]): Unit = {
val csvFilePath = "data/housingData.csv"
// Set up a SparkSession as normal
implicit val spark = SparkSession.builder.appName("OpBostonSimpleTest").master("local[*]").getOrCreate()
import spark.implicits._ // Needed for Encoders for the BostonHouse case class
// Define features using the OP types based on the data
val rowId = FeatureBuilder.Integral[BostonHouse].extract(_.rowId.toIntegral).asPredictor
val crim = FeatureBuilder.RealNN[BostonHouse].extract(_.crim.toRealNN).asPredictor
val zn = FeatureBuilder.RealNN[BostonHouse].extract(_.zn.toRealNN).asPredictor
val indus = FeatureBuilder.RealNN[BostonHouse].extract(_.indus.toRealNN).asPredictor
val chas = FeatureBuilder.PickList[BostonHouse].extract(x => Option(x.chas).toPickList).asPredictor
val nox = FeatureBuilder.RealNN[BostonHouse].extract(_.nox.toRealNN).asPredictor
val rm = FeatureBuilder.RealNN[BostonHouse].extract(_.rm.toRealNN).asPredictor
val age = FeatureBuilder.RealNN[BostonHouse].extract(_.age.toRealNN).asPredictor
val dis = FeatureBuilder.RealNN[BostonHouse].extract(_.dis.toRealNN).asPredictor
val rad = FeatureBuilder.Integral[BostonHouse].extract(_.rad.toIntegral).asPredictor
val tax = FeatureBuilder.RealNN[BostonHouse].extract(
val ptratio = FeatureBuilder.RealNN[BostonHouse].extract(_.ptratio.toRealNN).asPredictor
val b = FeatureBuilder.RealNN[BostonHouse].extract(_.b.toRealNN).asPredictor
val lstat = FeatureBuilder.RealNN[BostonHouse].extract(_.lstat.toRealNN).asPredictor
val medv = FeatureBuilder.RealNN[BostonHouse].extract(_.medv.toRealNN).asResponse
// Define a feature of type vector containing all the predictors you'd like to use
val features = Seq(crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b, lstat).transmogrify()
val label = medv
val checkedFeatures = label.sanityCheck(features, removeBadFeatures = true)
// Define the model we want to use (here a simple linear regression) and get the resulting output
val prediction = RegressionModelSelector
modelTypesToUse = Seq(OpLinearRegression), seed = 42)
.setInput(label, checkedFeatures).getOutput()
val evaluator = Evaluators.Regression().setLabelCol(label).setPredictionCol(prediction)
val dataReader = DataReaders.Simple.csvCase[BostonHouse](path = Option(csvFilePath), key = _.rowId.toString())
val workflow = new OpWorkflow().setResultFeatures(prediction, label).setReader(dataReader)
val model = workflow.train()
// Extract information (i.e. feature importance) via model insights
val modelInsights = model.modelInsights(prediction)
val modelFeatures = modelInsights.features.flatMap( feature => feature.derivedFeatures)
val featureContributions = feature => (feature.derivedFeatureName, contribution => math.abs(contribution))
.foldLeft(0.0) { (max, contribution) => math.max(max, contribution)}))
val sortedContributions = featureContributions.sortBy( contribution => -contribution._2)
val topNum = math.min(20, sortedContributions.size)
// Manifest the result features of the workflow
val (scores, metrics) = model.scoreAndEvaluate(evaluator = evaluator)
// Stop Spark gracefully