spark-excel icon indicating copy to clipboard operation
spark-excel copied to clipboard

Unexpected formula parsing on string columns

Open surendra-mt opened this issue 3 years ago • 11 comments

Hi Team, We are facing an issue while writing a pyspark df to xlsx. One of our columns (string type) contains strings such as `= >85`. When converting the df to xlsx, the write breaks because the library tries to parse the value as a formula, resulting in an error.

 shadeio.poi.ss.formula.FormulaParseException: Parse error near char 1 '>' in specified formula ' >85'. Expected cell ref or constant literal
	at shadeio.poi.ss.formula.FormulaParser.expected(FormulaParser.java:269)
	at shadeio.poi.ss.formula.FormulaParser.parseSimpleFactor(FormulaParser.java:1553)
	at shadeio.poi.ss.formula.FormulaParser.percentFactor(FormulaParser.java:1506)
	at shadeio.poi.ss.formula.FormulaParser.powerFactor(FormulaParser.java:1493)
	at shadeio.poi.ss.formula.FormulaParser.Term(FormulaParser.java:1867)
	at shadeio.poi.ss.formula.FormulaParser.additiveExpression(FormulaParser.java:1994)
	at shadeio.poi.ss.formula.FormulaParser.concatExpression(FormulaParser.java:1978)
	at shadeio.poi.ss.formula.FormulaParser.comparisonExpression(FormulaParser.java:1935)
	at shadeio.poi.ss.formula.FormulaParser.intersectionExpression(FormulaParser.java:1908)
	at shadeio.poi.ss.formula.FormulaParser.unionExpression(FormulaParser.java:1889)
	at shadeio.poi.ss.formula.FormulaParser.parse(FormulaParser.java:2036)
	at shadeio.poi.ss.formula.FormulaParser.parse(FormulaParser.java:170)
	at shadeio.poi.xssf.usermodel.XSSFCell.setFormula(XSSFCell.java:550)
	at shadeio.poi.xssf.usermodel.XSSFCell.setCellFormulaImpl(XSSFCell.java:526)
	at shadeio.poi.ss.usermodel.CellBase.setCellFormula(CellBase.java:132)
	at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.convertCell(Model2XlsxConversions.scala:49)
	at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$$anonfun$convertRow$4.apply(Model2XlsxConversions.scala:143)
	at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$$anonfun$convertRow$4.apply(Model2XlsxConversions.scala:143)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.convertRow(Model2XlsxConversions.scala:143)
	at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$$anonfun$writeToExistingSheet$1.apply(Model2XlsxConversions.scala:156)
	at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$$anonfun$writeToExistingSheet$1.apply(Model2XlsxConversions.scala:156)
	at scala.collection.immutable.List.foreach(List.scala:392)
	at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.writeToExistingSheet(Model2XlsxConversions.scala:156)
	at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$$anonfun$writeToExistingWorkbook$4.apply(Model2XlsxConversions.scala:324)
	at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$$anonfun$writeToExistingWorkbook$4.apply(Model2XlsxConversions.scala:321)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
	at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.writeToExistingWorkbook(Model2XlsxConversions.scala:321)
	at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$XlsxWorkbook.writeToExisting(Model2XlsxConversions.scala:421)
	at com.crealytics.spark.excel.ExcelFileSaver.writeToWorkbook$1(ExcelFileSaver.scala:40)
	at com.crealytics.spark.excel.ExcelFileSaver.save(ExcelFileSaver.scala:48)
	at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:60)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
	at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:285)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:229)
	at sun.reflect.GeneratedMethodAccessor249.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Sample code:


# Sample data: the first field "= >85" starts with '=', which is what
# triggers the FormulaParseException when writing to xlsx.
temp_data = [("= >85", "valid sting","valid string 2"),(">=85",""">85%
RACE updation in Veeva""",""">10.7 for HCP
> 8 for HCOs""")]

# Build a DataFrame from the sample rows (column names are auto-generated).
rdd = sc.parallelize(temp_data)

temp_df = rdd.toDF()

# Writing the DataFrame to a temporary directory reproduces the crash.
with tempfile.TemporaryDirectory() as directory:
    write_df_to_xlsx_format(temp_df,directory)

Expected Behavior

The above code should generate an xlsx file.

Possible Solution

Formula parsing should be disabled inside strings.

Steps to Reproduce (for bugs)

Can run above sample code for the same.

Context

We use this plugin to convert a spark df to xlsx and then send the report as an email to the client.

Your Environment

ENV: prod

surendra-mt avatar Dec 06 '21 07:12 surendra-mt

Hi @surendra-mt Please help share the mentioned method write_df_to_xlsx_format. Or just part of it, so that we can reproduce the issue with your temp_data. Sincerely,

quanghgx avatar Dec 10 '21 13:12 quanghgx

Hi @quanghgx apologies. I thought I have included that in the main body. Here is the function code.

def write_df_to_xlsx_format(df, directory, sheetName="'Sheet1'!A1"):
    """Export a Spark DataFrame as a single xlsx file under `directory`.

    Coalesces to one partition so exactly one Excel file is produced,
    then writes it via the com.crealytics.spark.excel (V1) data source.

    Args:
        df: Spark DataFrame to export.
        directory: target directory; output is saved as <directory>/temp.xlsx.
        sheetName: Excel data address (sheet name + top-left cell) to write at.
    """
    print("writing df to temp dir as xlsx file")
    # Build the writer step by step rather than one long call chain.
    writer = df.coalesce(1).write.format("com.crealytics.spark.excel")
    writer = writer.option("dataAddress", sheetName)
    writer = writer.option("header", "true")
    writer = writer.option("dateFormat", "yy-mm-dd")
    writer = writer.option("timestampFormat", "yyyy-MM-dd HH:mm")
    writer.mode("overwrite").save(directory + "/temp.xlsx")

surendra-mt avatar Dec 15 '21 12:12 surendra-mt

Hi @quanghgx any update on this?

surendra-mt avatar Dec 27 '21 05:12 surendra-mt

Hi @surendra-mt I am going to take a try with weekend and get back to you. Thanks for providing all needed detail.

quanghgx avatar Dec 27 '21 14:12 quanghgx

Hi @quanghgx sorry for bothering you. We have a pending ticket for this on prod. Let us know, if there is any update on this. Thanks.

surendra-mt avatar Jan 06 '22 12:01 surendra-mt

I am so sorry for long delay from my side. Checking..

quanghgx avatar Jan 10 '22 12:01 quanghgx

Hi @surendra-mt

Summary:

  • I can reproduce the issue with spark-excel V1 implementation, and will look into it for detail later.
  • The V2 implementation works; if it's possible for you, please give it a try. Details below.

A. I can reproduce the given issue with the v1 implementation and end up with the same error message:

22/01/10 23:32:01 INFO DAGScheduler: Job 0 finished: toList at DataLocator.scala:104, took 0.473501 s
Exception in thread "main" org.apache.poi.ss.formula.FormulaParseException: Parse error near char 1 '>' in specified formula ' >85'. Expected cell ref or constant literal
	at org.apache.poi.ss.formula.FormulaParser.expected(FormulaParser.java:272)
FormulaParser.java:272
	at org.apache.poi.ss.formula.FormulaParser.parseSimpleFactor(FormulaParser.java:1544)
FormulaParser.java:1544
	at org.apache.poi.ss.formula.FormulaParser.percentFactor(FormulaParser.java:1497)
FormulaParser.java:1497
	at org.apache.poi.ss.formula.FormulaParser.powerFactor(FormulaParser.java:1484)
FormulaParser.java:1484
	at org.apache.poi.ss.formula.FormulaParser.Term(FormulaParser.java:1858)
FormulaParser.java:1858

B. Is it possible for you to use the V2 implementation?

  • Change .format("com.crealytics.spark.excel") --> .format("excel")
  • V2 always writes to a directory; inside it will be the data you need. It also expects the parent folder to exist before writing.

Here is my full experimental snippet:

package com.crealytics.spark

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.Row
import java.util
import scala.collection.JavaConverters._
import org.apache.spark.sql.SaveMode

object AppEntry {

  /** Reproduction for "Unexpected formula parsing on string columns":
    * writes a DataFrame whose first column starts with '=' using the
    * spark-excel V1 data source, which fails inside POI's FormulaParser.
    */
  def main(args: Array[String]): Unit = {
    // Local import keeps this snippet self-contained.
    import scala.util.control.NonFatal

    println("Hello, world")

    // Single-process local session; UI disabled for a one-shot repro.
    val spark = SparkSession
      .builder()
      .master("local")
      .appName("Spark SQL basic example")
      .config("spark.ui.enabled", false)
      .getOrCreate()

    // Three nullable string columns.
    val schema = StructType(
      List(
        StructField("C1", StringType, true),
        StructField("C2", StringType, true),
        StructField("C3", StringType, true)
      )
    )

    // "= >85" is the problematic value: the leading '=' is treated as a formula.
    val temp_data: util.List[Row] = List(
      Row("= >85", "valid sting", "valid string 2"),
      Row(">=85", """>85%RACE updation in Veeva""", """>10.7 for HCP> 8 for HCOs""")
    ).asJava

    val df = spark.createDataFrame(temp_data, schema)
    df.show()

    /* V2 implementation, works */
    // df.write
    //     .format("excel")
    //     .option("dataAddress", "'Sheet1'!A1")
    //     .mode(SaveMode.Append)
    //     .save("/home/quanghgx/Downloads/476_surendra-mt.xlsx")

    /* Same issue as surendra-mt: the V1 writer throws FormulaParseException */
    df.coalesce(1).write.format("com.crealytics.spark.excel")
        .option("dataAddress", "'Sheet1'!A1")
        .option("header", "true")
        .option("dateFormat", "yy-mm-dd")
        .option("timestampFormat", "yyyy-MM-dd HH:mm")
        .mode("overwrite")
        .save("/home/quanghgx/Downloads//temp.xlsx")

    // Best-effort shutdown: swallow only non-fatal errors so fatal JVM
    // errors (OOM, etc.) still propagate instead of being silently eaten.
    try {
      spark.close()
    } catch {
      case NonFatal(_) => () // NOP
    }
  }
}

C. Output of spark-excel V2 implementation is a directory: And the data file contains correct value Screenshot from 2022-01-10 23-43-51

quanghgx avatar Jan 10 '22 16:01 quanghgx

Hi @surendra-mt Do you have time to take another try with .format("excel")?

quanghgx avatar Jan 15 '22 12:01 quanghgx

Hi @quanghgx Sorry, I was busy with some other task. Will check this in this week. Thanks.

surendra-mt avatar Jan 19 '22 09:01 surendra-mt

Hi @quanghgx I tried with v2. These are the issues I am facing with it. So currently we are using spark 2.4.3 and v2 uses functionality that requires spark 3.2.1+ (for example: org.apache.spark.sql.connector.catalog.TableProvider).

Second, when I tried to build jar for spark version 2.4.3, I found out that even that won't work as we are on scala 2.11.

For both of the issues, I will ask code owner if it is possible to upgrade versions.

Thanks a lot for your help!

surendra-mt avatar Feb 02 '22 12:02 surendra-mt

@quanghgx Hi, I am facing a similar issue with spark 3.1.2, scala 2.12, java 11. The query simply reads data from snowflake and writes it back as an excel report — no fancy stuff or calculations. The v2 version also failed.

regular version error

2022-05-29T02:01:46.514921848Z Exception in thread "main" shadeio.poi.ss.formula.FormulaParseException: Parse error near char 0 ' ' in specified formula ''. Expected cell ref or constant literal 2022-05-29T02:01:46.515185671Z at shadeio.poi.ss.formula.FormulaParser.expected(FormulaParser.java:272) 2022-05-29T02:01:46.515195984Z at shadeio.poi.ss.formula.FormulaParser.parseSimpleFactor(FormulaParser.java:1558) 2022-05-29T02:01:46.515200544Z at shadeio.poi.ss.formula.FormulaParser.percentFactor(FormulaParser.java:1511) 2022-05-29T02:01:46.515204460Z at shadeio.poi.ss.formula.FormulaParser.powerFactor(FormulaParser.java:1498) 2022-05-29T02:01:46.515208495Z at shadeio.poi.ss.formula.FormulaParser.Term(FormulaParser.java:1872) 2022-05-29T02:01:46.515212121Z at shadeio.poi.ss.formula.FormulaParser.additiveExpression(FormulaParser.java:1999) 2022-05-29T02:01:46.515215958Z at shadeio.poi.ss.formula.FormulaParser.concatExpression(FormulaParser.java:1983) 2022-05-29T02:01:46.515219717Z at shadeio.poi.ss.formula.FormulaParser.comparisonExpression(FormulaParser.java:1940) 2022-05-29T02:01:46.515223706Z at shadeio.poi.ss.formula.FormulaParser.intersectionExpression(FormulaParser.java:1913) 2022-05-29T02:01:46.515236277Z at shadeio.poi.ss.formula.FormulaParser.unionExpression(FormulaParser.java:1894) 2022-05-29T02:01:46.515243429Z at shadeio.poi.ss.formula.FormulaParser.parse(FormulaParser.java:2041) 2022-05-29T02:01:46.515247654Z at shadeio.poi.ss.formula.FormulaParser.parse(FormulaParser.java:173) 2022-05-29T02:01:46.515251274Z at shadeio.poi.xssf.usermodel.XSSFCell.setFormula(XSSFCell.java:480) 2022-05-29T02:01:46.515257312Z at shadeio.poi.xssf.usermodel.XSSFCell.setCellFormulaImpl(XSSFCell.java:460) 2022-05-29T02:01:46.515260998Z at shadeio.poi.ss.usermodel.CellBase.setCellFormula(CellBase.java:124) 2022-05-29T02:01:46.515264527Z at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.convertCell(Model2XlsxConversions.scala:49) 2022-05-29T02:01:46.515268517Z at 
shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.$anonfun$convertRow$6(Model2XlsxConversions.scala:143) 2022-05-29T02:01:46.515307587Z at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62) 2022-05-29T02:01:46.515312799Z at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55) 2022-05-29T02:01:46.515317106Z at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49) 2022-05-29T02:01:46.515327768Z at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.convertRow(Model2XlsxConversions.scala:143) 2022-05-29T02:01:46.515334415Z at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.$anonfun$writeToExistingSheet$2(Model2XlsxConversions.scala:156) 2022-05-29T02:01:46.515338305Z at scala.collection.immutable.List.foreach(List.scala:431) 2022-05-29T02:01:46.515341243Z at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.writeToExistingSheet(Model2XlsxConversions.scala:156) 2022-05-29T02:01:46.515344824Z at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.$anonfun$writeToExistingWorkbook$1(Model2XlsxConversions.scala:323) 2022-05-29T02:01:46.515347795Z at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36) 2022-05-29T02:01:46.515356669Z at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33) 2022-05-29T02:01:46.515363574Z at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38) 2022-05-29T02:01:46.515370705Z at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$.writeToExistingWorkbook(Model2XlsxConversions.scala:320) 2022-05-29T02:01:46.515378848Z at shadeio.spoiwo.natures.xlsx.Model2XlsxConversions$XlsxWorkbook.writeToExisting(Model2XlsxConversions.scala:420) 2022-05-29T02:01:46.515400467Z at com.crealytics.spark.excel.ExcelFileSaver.writeToWorkbook$1(ExcelFileSaver.scala:37) 2022-05-29T02:01:46.515435730Z at com.crealytics.spark.excel.ExcelFileSaver.save(ExcelFileSaver.scala:45) 2022-05-29T02:01:46.515442388Z at 
com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:54) 2022-05-29T02:01:46.515446291Z at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)

tufanrakshit avatar May 29 '22 02:05 tufanrakshit