koalas
koalas copied to clipboard
A PySpark DataFrame converted to a Koalas DataFrame has different elements
Code:
import databricks.koalas as ks
from pyspark import SparkConf
from pyspark.sql import SparkSession

if __name__ == '__main__':
    conf = SparkConf().setAppName("test")
    spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()

    sdf = spark.sql("select uid,vr_id, gender, follow_count, byfollow_count, is_click "
                    "from database.table where data_date=20220726 "
                    "and uid=249462081764458496 limit 5")
    # Root cause of the mismatch: `LIMIT 5` without an ORDER BY is
    # non-deterministic, and Spark evaluates lazily — `show()` and each
    # later Koalas action re-execute the query and may pick a different
    # set of 5 rows.  Caching materializes one result set so every
    # downstream action (show, to_koalas, unique, ...) sees the same rows.
    sdf.cache()

    sdf.show(n=20)

    print("=======================to_koalas===============================")
    df = sdf.to_koalas()

    # fillna(0) guards against NULLs coming out of the Hive table.
    category_features_df = df[["uid", "vr_id", "gender"]].fillna(0)
    dense_features_df = df[["follow_count", "byfollow_count"]].fillna(0)
    # Use to_numpy() instead of .values — koalas emits a UserWarning
    # recommending exactly that.
    y = df["is_click"].to_numpy()
    print("category_features_df: {}".format(category_features_df))
    print("dense_features_df: {}".format(dense_features_df))

    # Build id <-> dense-index lookup tables for the categorical features.
    total_uids = category_features_df["uid"].unique().tolist()
    total_vids = category_features_df["vr_id"].unique().tolist()
    uid_id2index = {uid: i for i, uid in enumerate(total_uids)}
    uid_index2id = {i: uid for uid, i in uid_id2index.items()}
    vid_id2index = {vid: i for i, vid in enumerate(total_vids)}
    vid_index2id = {i: vid for vid, i in vid_id2index.items()}
    print(f"uid_id2index: {uid_id2index}")
    print(f"vid_id2index: {vid_id2index}")
The result:
+------------------+------------------+------+------------+--------------+--------+
| uid| vr_id|gender|follow_count|byfollow_count|is_click|
+------------------+------------------+------+------------+--------------+--------+
|249462081764458496|234389742446182400| 0| 4| 2| 0|
|249462081764458496|247965851351777280| 0| 4| 2| 0|
|249462081764458496|303938736226304000| 0| 4| 2| 0|
|249462081764458496|305220054218178560| 0| 4| 2| 0|
|249462081764458496|150357127037190144| 0| 4| 2| 0|
+------------------+------------------+------+------------+--------------+--------+
=======================to_koalas===============================
/mnt/softwares/my_env/lib/python3.6/site-packages/databricks/koalas/generic.py:603: UserWarning: We recommend using `Series.to_numpy()` instead.
warnings.warn("We recommend using `{}.to_numpy()` instead.".format(type(self).__name__))
category_features_df: uid vr_id gender
0 249462081764458496 239951849459810304 0
1 249462081764458496 218479966654824448 0
2 249462081764458496 269598027864342528 0
3 249462081764458496 306587488548290560 0
4 249462081764458496 270454206781980672 0
dense_features_df: follow_count byfollow_count
0 4 2
1 4 2
2 4 2
3 4 2
4 4 2
uid_id2index: {249462081764458496: 0}
vid_id2index: {298760687402876928: 0, 306851269564170240: 1, 306601561927188480: 2, 269902057735979008: 3, 286263993075499008: 4}
Why is `sdf` different from `df`?
import databricks.koalas as ks
from pyspark import SparkConf
from pyspark.sql import SparkSession

if __name__ == '__main__':
    conf = SparkConf().setAppName("test")
    spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()

    sdf = spark.sql("select uid,vr_id, gender, follow_count, byfollow_count, is_click "
                    "from database.table where data_date=20220726 "
                    "and uid=249462081764458496 limit 5")
    # The explicit casts below do NOT fix the row mismatch — that comes from
    # `LIMIT 5` without ORDER BY being re-executed per action.  Cache the
    # DataFrame so show() and the Koalas conversion operate on the same rows.
    sdf.cache()

    sdf.show(n=20)

    print("=======================to_koalas===============================")
    df = sdf.to_koalas()

    # Explicitly cast columns to the desired data types.  uid/vr_id are
    # snowflake-style 64-bit ids, so they must stay int64; the remaining
    # columns are small integers.
    df["uid"] = df["uid"].astype("int64")
    df["vr_id"] = df["vr_id"].astype("int64")
    df["gender"] = df["gender"].astype("int32")
    df["follow_count"] = df["follow_count"].astype("int32")
    df["byfollow_count"] = df["byfollow_count"].astype("int32")
    df["is_click"] = df["is_click"].astype("int32")

    category_features_df = df[["uid", "vr_id", "gender"]].fillna(0)
    dense_features_df = df[["follow_count", "byfollow_count"]].fillna(0)
    # to_numpy() is the recommended replacement for .values in koalas.
    y = df["is_click"].to_numpy()
    print("category_features_df: {}".format(category_features_df))
    print("dense_features_df: {}".format(dense_features_df))

    # id <-> dense-index lookup tables for embedding-style features.
    total_uids = category_features_df["uid"].unique().tolist()
    total_vids = category_features_df["vr_id"].unique().tolist()
    uid_id2index = {uid: i for i, uid in enumerate(total_uids)}
    uid_index2id = {i: uid for uid, i in uid_id2index.items()}
    vid_id2index = {vid: i for i, vid in enumerate(total_vids)}
    vid_index2id = {i: vid for vid, i in vid_id2index.items()}
    print(f"uid_id2index: {uid_id2index}")
    print(f"vid_id2index: {vid_id2index}")
Can I take this issue?
@tsafacjo could you open a ticket on the Spark JIRA and make a fix for pyspark.pandas instead of Koalas? This repository is no longer maintained, since Koalas has been migrated into Apache Spark.
ok, thanks @itholic
No problem! Please feel free to ping me if you want any help contributing to Apache Spark.