TransmogrifAI icon indicating copy to clipboard operation
TransmogrifAI copied to clipboard

Two cache miss cases

Open waruto210 opened this issue 1 year ago • 0 comments

Describe the bug: We have developed a tool named AutoCache that automatically detects cache miss bugs in Spark applications. During evaluation, we found two cache miss bugs in TransmogrifAI. Could you please consider adding caching for them? Code paths:

  • core/src/main/scala/com/salesforce/op/stages/impl/tuning/OpTrainValidationSplit.scala:124
  • core/src/main/scala/com/salesforce/op/evaluators/OpEvaluatorBase.scala:237

To Reproduce

Environment:

  • Spark 2.4.5
  • TransmogrifAI 0.7.0
  • Scala 2.11.12

Run the following example code:
import com.salesforce.op._
import com.salesforce.op.evaluators.Evaluators
import com.salesforce.op.features.FeatureBuilder
import com.salesforce.op.features.types._
import com.salesforce.op.readers.DataReaders
import com.salesforce.op.stages.impl.regression.RegressionModelSelector
import com.salesforce.op.stages.impl.regression.RegressionModelsToTry._
import org.apache.spark.sql.SparkSession

/**
 * One row of the Boston housing data set.
 *
 * Field order matches the constructor parameter order below.
 *
 * @param rowId   identifier of the house
 * @param crim    per-capita crime rate by town
 * @param zn      proportion of residential land zoned for lots over 25,000 sq.ft.
 * @param indus   proportion of non-retail business acres per town
 * @param chas    Charles River dummy variable (= 1 if the tract bounds the river; 0 otherwise),
 *                carried as a String
 * @param nox     nitric oxides concentration (parts per 10 million)
 * @param rm      average number of rooms per dwelling
 * @param age     proportion of owner-occupied units built prior to 1940
 * @param dis     weighted distances to five Boston employment centres
 * @param rad     index of accessibility to radial highways
 * @param tax     full-value property-tax rate per $10,000
 * @param ptratio pupil-teacher ratio by town
 * @param b       1000(Bk - 0.63)**2 where Bk is the proportion of Black residents by town
 * @param lstat   % lower status of the population
 * @param medv    median value of owner-occupied homes in $1000's
 */
case class BostonHouse(
  rowId: Int,
  crim: Double,
  zn: Double,
  indus: Double,
  chas: String,
  nox: Double,
  rm: Double,
  age: Double,
  dis: Double,
  rad: Int,
  tax: Double,
  ptratio: Double,
  b: Double,
  lstat: Double,
  medv: Double
)

/**
 * A simplified TransmogrifAI example regression app using the Boston housing dataset.
 *
 * It builds raw features from [[BostonHouse]] rows, selects a regression model through
 * `RegressionModelSelector` with a train/validation split, trains an `OpWorkflow`, reads
 * the model insights and finally scores and evaluates the model with a regression evaluator.
 */
object OpBostonSimpleTest {

  /**
   * Run this from the command line with
   * ./gradlew sparkSubmit -Dmain=com.salesforce.hw.OpBostonSimple -Dargs=/full/path/to/csv/file
   *
   * NOTE(review): the main class named in the command above (`OpBostonSimple`) differs from
   * this object's name (`OpBostonSimpleTest`) - confirm which one is intended.
   *
   * @param args command line arguments; not read by this method - the CSV path is hard-coded
   *             in `csvFilePath`, so a path passed via `-Dargs` has no effect
   */
  def main(args: Array[String]): Unit = {

    // Hard-coded, relative input path (resolved against the working directory of the process)
    val csvFilePath = "data/housingData.csv"

    // Set up a SparkSession as normal
    implicit val spark = SparkSession.builder.appName("OpBostonSimpleTest").master("local[*]").getOrCreate()
    import spark.implicits._ // Needed for Encoders for the BostonHouse case class

    ////////////////////////////////////////////////////////////////////////////////
    // RAW FEATURE DEFINITIONS
    /////////////////////////////////////////////////////////////////////////////////

    // Define features using the OP types based on the data
    // NOTE(review): presumably FeatureBuilder derives each feature's name from the name of the
    // val it is assigned to - confirm before renaming any of these vals.
    // rowId is defined here but is not part of the `features` vector built below.
    val rowId = FeatureBuilder.Integral[BostonHouse].extract(_.rowId.toIntegral).asPredictor
    val crim = FeatureBuilder.RealNN[BostonHouse].extract(_.crim.toRealNN).asPredictor
    val zn = FeatureBuilder.RealNN[BostonHouse].extract(_.zn.toRealNN).asPredictor
    val indus = FeatureBuilder.RealNN[BostonHouse].extract(_.indus.toRealNN).asPredictor
    // Option(...) turns a null `chas` into None before converting to a PickList
    val chas = FeatureBuilder.PickList[BostonHouse].extract(x => Option(x.chas).toPickList).asPredictor
    val nox = FeatureBuilder.RealNN[BostonHouse].extract(_.nox.toRealNN).asPredictor
    val rm = FeatureBuilder.RealNN[BostonHouse].extract(_.rm.toRealNN).asPredictor
    val age = FeatureBuilder.RealNN[BostonHouse].extract(_.age.toRealNN).asPredictor
    val dis = FeatureBuilder.RealNN[BostonHouse].extract(_.dis.toRealNN).asPredictor
    val rad = FeatureBuilder.Integral[BostonHouse].extract(_.rad.toIntegral).asPredictor
    val tax = FeatureBuilder.RealNN[BostonHouse].extract(_.tax.toRealNN).asPredictor
    val ptratio = FeatureBuilder.RealNN[BostonHouse].extract(_.ptratio.toRealNN).asPredictor
    val b = FeatureBuilder.RealNN[BostonHouse].extract(_.b.toRealNN).asPredictor
    val lstat = FeatureBuilder.RealNN[BostonHouse].extract(_.lstat.toRealNN).asPredictor
    // medv is the only feature marked as the response (label)
    val medv = FeatureBuilder.RealNN[BostonHouse].extract(_.medv.toRealNN).asResponse


    // Define a feature of type vector containing all the predictors you'd like to use
    val features = Seq(crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b, lstat).transmogrify()

    val label = medv

    // NOTE(review): presumably removeBadFeatures = true drops features the sanity checker
    // flags - confirm against the TransmogrifAI documentation
    val checkedFeatures = label.sanityCheck(features, removeBadFeatures = true)

    ////////////////////////////////////////////////////////////////////////////////
    // WORKFLOW DEFINITION
    /////////////////////////////////////////////////////////////////////////////////

    // Define the model we want to use (here a simple linear regression) and get the resulting output

    // Only OpLinearRegression is tried; the selector uses a train/validation split with seed 42
    val prediction = RegressionModelSelector
      .withTrainValidationSplit(
        modelTypesToUse = Seq(OpLinearRegression), seed = 42)
      .setInput(label, checkedFeatures).getOutput()

    // Regression evaluator comparing the label column with the prediction column
    val evaluator = Evaluators.Regression().setLabelCol(label).setPredictionCol(prediction)

    ////////////////////////////////////////////////////////////////////////////////
    // WORKFLOW
    /////////////////////////////////////////////////////////////////////////////////

    // Read BostonHouse rows from the CSV file, keyed by the string form of rowId
    val dataReader = DataReaders.Simple.csvCase[BostonHouse](path = Option(csvFilePath), key = _.rowId.toString())

    val workflow = new OpWorkflow().setResultFeatures(prediction, label).setReader(dataReader)

    val model = workflow.train()


    // Extract information (i.e. feature importance) via model insights
    val modelInsights = model.modelInsights(prediction)
    val modelFeatures = modelInsights.features.flatMap( feature => feature.derivedFeatures)
    // Pair each derived feature name with its largest absolute contribution (0.0 when it has none)
    val featureContributions = modelFeatures.map( feature => (feature.derivedFeatureName,
      feature.contribution.map( contribution => math.abs(contribution))
        .foldLeft(0.0) { (max, contribution) => math.max(max, contribution)}))
    // Sort by contribution, largest first
    val sortedContributions = featureContributions.sortBy( contribution => -contribution._2)

    // Capped at 20; not used anywhere else in this method
    val topNum = math.min(20, sortedContributions.size)
    // Manifest the result features of the workflow
    // `scores` and `metrics` are not read afterwards in this method
    val (scores, metrics) = model.scoreAndEvaluate(evaluator = evaluator)

    // Block until a byte can be read from standard input (or end of stream).
    // Nothing in this method stops the SparkSession explicitly.
    // NOTE(review): presumably this keeps the application alive for inspection - confirm.
    System.in.read()
  }
}

waruto210 avatar Aug 31 '22 06:08 waruto210