add spark optim

aphp · Vincent-Maladiere · Mar 10, 2023 · Mar 9, 2023 · Mar 9, 2023 · 9a64218c3cabf779baa9f649c517aba868f5c745
commit 9a64218c3cabf779baa9f649c517aba868f5c745
diff --git a/eds_scikit/__init__.py b/eds_scikit/__init__.py
@@ -57,6 +57,7 @@ def koalas_options() -> None:
 
     ks.set_option("compute.default_index_type", "distributed")
     ks.set_option("compute.ops_on_diff_frames", True)
+    ks.set_option("display.max_rows", 50)
 
 
 def set_env_variables() -> None:

diff --git a/eds_scikit/utils/custom_implem/custom_implem.py b/eds_scikit/utils/custom_implem/custom_implem.py
@@ -70,7 +70,11 @@ def cache(cls, df, backend=None):
             # no-op
             return
         elif backend is ks:
-            df.spark.cache()
+            # Cache using count(), a simple action that triggers
+            # eager evaluation and effectively caches the dataframe.
+            # See this link for more details about the count trick:
+            # https://stackoverflow.com/a/44002485
+            df.spark.cache().count()
             return
         else:
             raise NotImplementedError(