spline-spark-agent
spline-spark-agent copied to clipboard
Checkpoint lineage support
With code like the following, Spline cannot capture the input data source:
from pyspark.sql import SparkSession
# Shared Spark session for the reproduction, with the Spline agent attached.
spark = (
    SparkSession.builder
    # Register Spline's listener so query executions are reported to it.
    .config(
        'spark.sql.queryExecutionListeners',
        'za.co.absa.spline.harvester.listener.SplineQueryExecutionListener',
    )
    # Producer endpoint the Spline agent is configured to send to.
    .config('spark.spline.producer.url', 'http://master-1-1:8080/producer')
    .enableHiveSupport()
    .getOrCreate()
)
def generate_data():
    """Write a two-row (name, value) DataFrame to the table ``test.table``.

    The table is saved with ``mode='overwrite'``; it is the table that
    ``test()`` reads back.
    """
    rows = [('a', 1), ('b', 2)]
    columns = ['name', 'value']
    source_df = spark.createDataFrame(rows, columns)
    source_df.write.saveAsTable('test.table', mode='overwrite')
def test():
    """Read ``test.table``, checkpoint it, and save it as ``test.table2``.

    This is the reproduction for the reported problem: the write that
    follows the checkpoint is the one whose lineage lacks its input source.
    """
    spark.sparkContext.setCheckpointDir('/tmp/checkpoint')
    source_df = spark.table('test.table')
    # The step under report: after checkpoint(), Spline no longer sees
    # 'test.table' as the input of the write below.
    checkpointed_df = source_df.checkpoint()
    checkpointed_df.write.saveAsTable('test.table2', mode='overwrite')
if __name__ == '__main__':
    # generate_data() is left disabled here; presumably it is run once
    # beforehand so that 'test.table' exists -- confirm before relying on it.
    # generate_data()
    test()
+1, we would definitely appreciate this feature too (either way, we're losing a whole chunk of lineage).