Edit on GitHub
sqlmesh.core.engine_adapter.spark

View Source
  1from __future__ import annotations
  2
  3import logging
  4import typing as t
  5from functools import partial
  6
  7from sqlglot import exp
  8
  9from sqlmesh.core.dialect import to_schema
 10from sqlmesh.core.engine_adapter.mixins import (
 11    GetCurrentCatalogFromFunctionMixin,
 12    HiveMetastoreTablePropertiesMixin,
 13    RowDiffMixin,
 14)
 15from sqlmesh.core.engine_adapter.shared import (
 16    CatalogSupport,
 17    CommentCreationTable,
 18    CommentCreationView,
 19    DataObject,
 20    DataObjectType,
 21    InsertOverwriteStrategy,
 22    SourceQuery,
 23    set_catalog,
 24)
 25from sqlmesh.utils import classproperty, get_source_columns_to_types
 26from sqlmesh.utils.errors import SQLMeshError
 27
 28if t.TYPE_CHECKING:
 29    import pandas as pd
 30    from pyspark.sql import types as spark_types
 31
 32    from sqlmesh.core._typing import SchemaName, TableName
 33    from sqlmesh.core.engine_adapter._typing import (
 34        DF,
 35        PySparkDataFrame,
 36        PySparkSession,
 37        Query,
 38    )
 39    from sqlmesh.core.engine_adapter.base import QueryOrDF
 40    from sqlmesh.engines.spark.db_api.spark_session import SparkSessionConnection
 41
 42
 43logger = logging.getLogger(__name__)
 44
 45
 46@set_catalog()
 47class SparkEngineAdapter(
 48    GetCurrentCatalogFromFunctionMixin, HiveMetastoreTablePropertiesMixin, RowDiffMixin
 49):
    # Dialect name for this adapter; also used below to build the DECIMAL type key.
    DIALECT = "spark"
    SUPPORTS_TRANSACTIONS = False
    INSERT_OVERWRITE_STRATEGY = InsertOverwriteStrategy.INSERT_OVERWRITE
    COMMENT_CREATION_TABLE = CommentCreationTable.IN_SCHEMA_DEF_NO_CTAS
    COMMENT_CREATION_VIEW = CommentCreationView.IN_SCHEMA_DEF_NO_COMMANDS
    # Note: Some formats (like Delta and Iceberg) support REPLACE TABLE but since we don't
    # currently check for storage formats we say we don't support REPLACE TABLE
    SUPPORTS_REPLACE_TABLE = False
    QUOTE_IDENTIFIERS_IN_VIEWS = False
    SUPPORTED_DROP_CASCADE_OBJECT_KINDS = ["DATABASE", "SCHEMA"]

    # Prefix put in front of a WAP id to form the branch name (see `_wap_branch_name`).
    WAP_PREFIX = "wap_"
    # Prefix put in front of the branch name when it is appended to a table name
    # (see `wap_table_name`); "branch_wap_" is the marker checked in `get_data_object`
    # and `_create_table`.
    BRANCH_PREFIX = "branch_"
    # NOTE(review): presumably consumed by the schema differ of the base adapter - confirm.
    SCHEMA_DIFFER_KWARGS = {
        "parameterized_type_defaults": {
            # default decimal precision varies across backends
            exp.DataType.build("DECIMAL", dialect=DIALECT).this: [(), (0,)],
        },
    }
 69
 70    @property
 71    def connection(self) -> SparkSessionConnection:
 72        return self._connection_pool.get()
 73
 74    @property
 75    def spark(self) -> PySparkSession:
 76        return self.connection.spark
 77
 78    @property
 79    def _use_spark_session(self) -> bool:
 80        return True
 81
 82    @property
 83    def use_serverless(self) -> bool:
 84        return False
 85
 86    @property
 87    def catalog_support(self) -> CatalogSupport:
 88        return CatalogSupport.FULL_SUPPORT
 89
 90    @classproperty
 91    def _sqlglot_to_spark_primitive_mapping(self) -> t.Dict[t.Any, t.Any]:
 92        from pyspark.sql import types as spark_types
 93
 94        return {
 95            exp.DataType.Type.TINYINT: spark_types.ByteType,
 96            exp.DataType.Type.SMALLINT: spark_types.ShortType,
 97            exp.DataType.Type.INT: spark_types.IntegerType,
 98            exp.DataType.Type.BIGINT: spark_types.LongType,
 99            exp.DataType.Type.FLOAT: spark_types.FloatType,
100            exp.DataType.Type.DOUBLE: spark_types.DoubleType,
101            exp.DataType.Type.DECIMAL: spark_types.DecimalType,
102            exp.DataType.Type.VARCHAR: spark_types.StringType,
103            exp.DataType.Type.CHAR: spark_types.StringType,
104            exp.DataType.Type.TEXT: spark_types.StringType,
105            exp.DataType.Type.BINARY: spark_types.BinaryType,
106            exp.DataType.Type.BOOLEAN: spark_types.BooleanType,
107            exp.DataType.Type.DATE: spark_types.DateType,
108            exp.DataType.Type.TIMESTAMPNTZ: spark_types.TimestampNTZType,
109            exp.DataType.Type.DATETIME: spark_types.TimestampNTZType,
110            exp.DataType.Type.TIMESTAMPLTZ: spark_types.TimestampType,
111            exp.DataType.Type.TIMESTAMP: spark_types.TimestampType,
112            exp.DataType.Type.TIMESTAMPTZ: spark_types.TimestampType,
113        }
114
115    @classproperty
116    def _sqlglot_to_spark_complex_mapping(self) -> t.Dict[t.Any, t.Any]:
117        from pyspark.sql import types as spark_types
118
119        return {
120            exp.DataType.Type.ARRAY: spark_types.ArrayType,
121            exp.DataType.Type.MAP: spark_types.MapType,
122            exp.DataType.Type.STRUCT: spark_types.StructType,
123        }
124
125    @classproperty
126    def _spark_to_sqlglot_primitive_mapping(self) -> t.Dict[t.Any, t.Any]:
127        return {v: k for k, v in self._sqlglot_to_spark_primitive_mapping.items()}
128
129    @classproperty
130    def _spark_to_sqlglot_complex_mapping(self) -> t.Dict[t.Any, t.Any]:
131        return {v: k for k, v in self._sqlglot_to_spark_complex_mapping.items()}
132
133    @classmethod
134    def spark_to_sqlglot_types(cls, input: spark_types.StructType) -> t.Dict[str, exp.DataType]:
135        from pyspark.sql import types as spark_types
136
137        def spark_complex_to_sqlglot_complex(
138            complex_type: t.Union[
139                spark_types.StructType, spark_types.ArrayType, spark_types.MapType
140            ],
141        ) -> exp.DataType:
142            def get_fields(
143                complex_type: t.Union[
144                    spark_types.StructType, spark_types.ArrayType, spark_types.MapType
145                ],
146            ) -> t.Sequence[spark_types.DataType]:
147                if isinstance(complex_type, spark_types.StructType):
148                    return complex_type.fields
149                if isinstance(complex_type, spark_types.ArrayType):
150                    return [complex_type.elementType]
151                if isinstance(complex_type, spark_types.MapType):
152                    return [complex_type.keyType, complex_type.valueType]
153                raise SQLMeshError(f"Unsupported complex type: {complex_type}")
154
155            expressions: t.List[t.Union[exp.ColumnDef, exp.DataType]] = []
156            fields = get_fields(complex_type)
157            for field in fields:
158                if isinstance(field, (spark_types.StructType, spark_types.MapType)):
159                    expressions.append(spark_complex_to_sqlglot_complex(field))
160                elif isinstance(field, spark_types.StructField):
161                    sqlglot_data_type = cls._spark_to_sqlglot_primitive_mapping.get(
162                        type(field.dataType)
163                    ) or spark_complex_to_sqlglot_complex(
164                        field.dataType  # type: ignore
165                    )
166                    kind = (
167                        sqlglot_data_type
168                        if isinstance(sqlglot_data_type, exp.DataType)
169                        else exp.DataType(this=sqlglot_data_type)
170                    )
171                    expressions.append(exp.ColumnDef(this=exp.to_identifier(field.name), kind=kind))
172                else:
173                    kind = exp.DataType(this=cls._spark_to_sqlglot_primitive_mapping[type(field)])
174                    expressions.append(kind)
175            dtype = cls._spark_to_sqlglot_complex_mapping[type(complex_type)]
176            return exp.DataType(
177                this=dtype,
178                expressions=expressions,
179                nested=True,
180            )
181
182        resp = spark_complex_to_sqlglot_complex(input)
183        return {column_def.this.name: column_def.args["kind"] for column_def in resp.expressions}
184
185    @classmethod
186    def sqlglot_to_spark_types(cls, input: t.Dict[str, exp.DataType]) -> spark_types.StructType:
187        from pyspark.sql import types as spark_types
188
189        def sqlglot_complex_to_spark_complex(complex_type: exp.DataType) -> spark_types.DataType:
190            is_struct = complex_type.is_type(exp.DataType.Type.STRUCT)
191            expressions = []
192            for column_def in complex_type.expressions:
193                col_name = column_def.this.name if is_struct else None
194                data_type = column_def.args["kind"] if is_struct else column_def
195                primitive_func = cls._sqlglot_to_spark_primitive_mapping.get(data_type.this)
196                type_func = (
197                    primitive_func
198                    if primitive_func
199                    else partial(sqlglot_complex_to_spark_complex, data_type)
200                )
201                if is_struct:
202                    expressions.append(spark_types.StructField(col_name, type_func()))  # type: ignore
203                else:
204                    expressions.append(type_func())  # type: ignore
205            klass = cls._sqlglot_to_spark_complex_mapping[complex_type.this]
206            if is_struct:
207                return klass(expressions)
208            return klass(*expressions)
209
210        return t.cast(
211            spark_types.StructType,
212            sqlglot_complex_to_spark_complex(
213                exp.DataType(
214                    this=exp.DataType.Type.STRUCT,
215                    expressions=[
216                        exp.ColumnDef(this=exp.to_identifier(column), kind=data_type)
217                        for column, data_type in input.items()
218                    ],
219                )
220            ),
221        )
222
223    @classmethod
224    def is_pyspark_df(cls, value: t.Any) -> bool:
225        return hasattr(value, "sparkSession")
226
227    @classmethod
228    def try_get_pyspark_df(cls, value: t.Any) -> t.Optional[PySparkDataFrame]:
229        if cls.is_pyspark_df(value):
230            return value
231        return None
232
233    @classmethod
234    def try_get_pandas_df(cls, value: t.Any) -> t.Optional[pd.DataFrame]:
235        import pandas as pd
236
237        if isinstance(value, pd.DataFrame):
238            return value
239        return None
240
241    @t.overload
242    def _columns_to_types(
243        self,
244        query_or_df: DF,
245        target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
246        source_columns: t.Optional[t.List[str]] = None,
247    ) -> t.Tuple[t.Dict[str, exp.DataType], t.List[str]]: ...
248
249    @t.overload
250    def _columns_to_types(
251        self,
252        query_or_df: Query,
253        target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
254        source_columns: t.Optional[t.List[str]] = None,
255    ) -> t.Tuple[t.Optional[t.Dict[str, exp.DataType]], t.Optional[t.List[str]]]: ...
256
257    def _columns_to_types(
258        self,
259        query_or_df: QueryOrDF,
260        target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
261        source_columns: t.Optional[t.List[str]] = None,
262    ) -> t.Tuple[t.Optional[t.Dict[str, exp.DataType]], t.Optional[t.List[str]]]:
263        if target_columns_to_types:
264            return target_columns_to_types, list(source_columns or target_columns_to_types)
265        if self.is_pyspark_df(query_or_df):
266            from pyspark.sql import DataFrame
267
268            target_columns_to_types = self.spark_to_sqlglot_types(
269                t.cast(DataFrame, query_or_df).schema
270            )
271            return target_columns_to_types, list(source_columns or target_columns_to_types)
272        return super()._columns_to_types(
273            query_or_df, target_columns_to_types, source_columns=source_columns
274        )
275
276    def _df_to_source_queries(
277        self,
278        df: DF,
279        target_columns_to_types: t.Dict[str, exp.DataType],
280        batch_size: int,
281        target_table: TableName,
282        source_columns: t.Optional[t.List[str]] = None,
283    ) -> t.List[SourceQuery]:
284        df = self._ensure_pyspark_df(df, target_columns_to_types, source_columns=source_columns)
285
286        def query_factory() -> Query:
287            temp_table = self._get_temp_table(target_table or "spark", table_only=True)
288            df.createOrReplaceGlobalTempView(temp_table.sql(dialect=self.dialect))  # type: ignore
289            temp_table.set("db", "global_temp")
290            return exp.select(*self._select_columns(target_columns_to_types)).from_(temp_table)
291
292        return [SourceQuery(query_factory=query_factory)]
293
294    def _ensure_pyspark_df(
295        self,
296        generic_df: DF,
297        target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
298        source_columns: t.Optional[t.List[str]] = None,
299    ) -> PySparkDataFrame:
300        pyspark_df = self.try_get_pyspark_df(generic_df)
301        if not pyspark_df:
302            df = self.try_get_pandas_df(generic_df)
303            if df is None:
304                raise SQLMeshError(
305                    "Ensure PySpark DF can only be run on a PySpark or Pandas DataFrame"
306                )
307
308            if target_columns_to_types:
309                source_columns_to_types = get_source_columns_to_types(
310                    target_columns_to_types, source_columns
311                )
312                # ensure Pandas dataframe column order matches columns_to_types
313                df = df[list(source_columns_to_types)]
314            else:
315                source_columns_to_types = None
316            kwargs = (
317                dict(schema=self.sqlglot_to_spark_types(source_columns_to_types))
318                if source_columns_to_types
319                else {}
320            )
321            pyspark_df = self.spark.createDataFrame(df, **kwargs)  # type: ignore
322        if target_columns_to_types:
323            select_columns = self._casted_columns(
324                target_columns_to_types, source_columns=source_columns
325            )
326            pyspark_df = pyspark_df.selectExpr(*[x.sql(self.dialect) for x in select_columns])  # type: ignore
327        return pyspark_df
328
329    def _get_temp_table(
330        self, table: TableName, table_only: bool = False, quoted: bool = True
331    ) -> exp.Table:
332        """
333        Returns the name of the temp table that should be used for the given table name.
334        """
335        table = super()._get_temp_table(table, table_only=table_only)
336        table_name_id = table.args["this"]
337        # Spark with local filesystem has an issue with temp tables that start with __temp so
338        # we update here to remove the leading double underscore
339        table_name_id.set("this", table_name_id.this.replace("__temp_", "temp_"))
340        return table
341
342    def fetchdf(
343        self, query: t.Union[exp.Expr, str], quote_identifiers: bool = False
344    ) -> pd.DataFrame:
345        return self.fetch_pyspark_df(query, quote_identifiers=quote_identifiers).toPandas()
346
347    def fetch_pyspark_df(
348        self, query: t.Union[exp.Expr, str], quote_identifiers: bool = False
349    ) -> PySparkDataFrame:
350        return self._ensure_pyspark_df(
351            self._fetch_native_df(query, quote_identifiers=quote_identifiers)
352        )
353
354    def _get_data_objects(
355        self, schema_name: SchemaName, object_names: t.Optional[t.Set[str]] = None
356    ) -> t.List[DataObject]:
357        schema_name = to_schema(schema_name).sql(dialect=self.dialect)
358        pattern = "*" if object_names is None else "|".join(object_names)
359        sql = f"SHOW TABLE EXTENDED IN {schema_name} LIKE '{pattern}'"
360        try:
361            results = (
362                self.fetch_pyspark_df(sql).collect()
363                if self._use_spark_session
364                else self.fetchdf(sql).to_dict("records")
365            )
366        # Improvement: Figure out all the different exceptions we could get from executing a query either with or
367        # without a Spark Session. In addition Databricks would need to be updated to handle it's own exceptions.
368        # Therefore just doing except Exception for now.
369        except Exception:
370            return []
371        data_objects = []
372        catalog = self.get_current_catalog()
373        for row in results:  # type: ignore
374            row_dict = row.asDict() if not isinstance(row, dict) else row
375            if row_dict.get("isTemporary"):
376                continue
377            schema = row_dict.get("namespace") or row_dict.get("database")
378            data_objects.append(
379                DataObject(
380                    catalog=catalog,
381                    schema=schema,
382                    name=row_dict["tableName"],
383                    type=(
384                        DataObjectType.VIEW
385                        if "Type: VIEW" in row_dict["information"]
386                        else DataObjectType.TABLE
387                    ),
388                )
389            )
390        return data_objects
391
392    def get_current_catalog(self) -> t.Optional[str]:
393        if self._use_spark_session:
394            return self.connection.get_current_catalog()
395        return super().get_current_catalog()
396
397    def set_current_catalog(self, catalog_name: str) -> None:
398        self.connection.set_current_catalog(catalog_name)
399
400    def _get_current_schema(self) -> str:
401        if self._use_spark_session:
402            return self.spark.catalog.currentDatabase()
403        return self.fetchone(exp.select(exp.func("current_database")))[0]  # type: ignore
404
405    def get_data_object(
406        self, target_name: TableName, safe_to_cache: bool = False
407    ) -> t.Optional[DataObject]:
408        target_table = exp.to_table(target_name)
409        if isinstance(target_table.this, exp.Dot) and target_table.this.expression.name.startswith(
410            f"{self.BRANCH_PREFIX}{self.WAP_PREFIX}"
411        ):
412            # Exclude the branch name
413            target_table.set("this", target_table.this.this)
414        return super().get_data_object(target_table, safe_to_cache=safe_to_cache)
415
416    def create_state_table(
417        self,
418        table_name: str,
419        target_columns_to_types: t.Dict[str, exp.DataType],
420        primary_key: t.Optional[t.Tuple[str, ...]] = None,
421    ) -> None:
422        self.create_table(
423            table_name,
424            target_columns_to_types,
425            partitioned_by=[exp.column(x) for x in primary_key] if primary_key else None,
426        )
427
428    def _native_df_to_pandas_df(
429        self,
430        query_or_df: QueryOrDF,
431    ) -> t.Union[Query, pd.DataFrame]:
432        if pyspark_df := self.try_get_pyspark_df(query_or_df):
433            return pyspark_df.toPandas()
434
435        return super()._native_df_to_pandas_df(query_or_df)
436
    def _create_table(
        self,
        table_name_or_schema: t.Union[exp.Schema, TableName],
        expression: t.Optional[exp.Expr],
        exists: bool = True,
        replace: bool = False,
        target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
        table_description: t.Optional[str] = None,
        column_descriptions: t.Optional[t.Dict[str, str]] = None,
        table_kind: t.Optional[str] = None,
        track_rows_processed: bool = True,
        **kwargs: t.Any,
    ) -> None:
        """Create a table through the parent implementation with WAP-specific handling.

        Two adjustments are made around the parent call:

        * When there is no `expression` and the table name ends with a part whose
          name starts with "branch_wap_", that part is removed from the table
          expression in place.
        * When `wap_enabled` is set, WAP is supported for the table (the
          `storage_format` keyword equals "iceberg" case-insensitively, or
          `wap_supported` returns true), `exists` is true and the table did not
          exist before the call, an `INSERT INTO t SELECT * FROM t` statement is
          executed after creation.

        NOTE(review): `table_kind` is accepted but not passed on to the parent
        call - confirm whether that is intentional.
        """
        table_name = (
            table_name_or_schema.this
            if isinstance(table_name_or_schema, exp.Schema)
            else exp.to_table(table_name_or_schema)
        )
        # Spark doesn't support creating a wap table DDL. When no query expression is given and the name
        # carries a WAP branch suffix, the suffix is dropped from the table expression in place.
        # NOTE(review): the parent call below only sees the stripped name if `table_name` is the same
        # object as (or part of) `table_name_or_schema` - confirm for non-Schema inputs.
        if not expression and isinstance(table_name.this, exp.Dot):
            wap_id = table_name.this.parts[-1].name
            if wap_id.startswith(f"{self.BRANCH_PREFIX}{self.WAP_PREFIX}"):
                table_name.set("this", table_name.this.this)

        do_dummy_insert = False
        if self.wap_enabled:
            wap_supported = (
                kwargs.get("storage_format") or ""
            ).lower() == "iceberg" or self.wap_supported(table_name)
            # Only insert when WAP applies, `exists` is set and the table is new.
            # The existence check has to happen before the table is created below.
            do_dummy_insert = (
                False if not wap_supported or not exists else not self.table_exists(table_name)
            )
        super()._create_table(
            table_name_or_schema,
            expression,
            exists=exists,
            replace=replace,
            target_columns_to_types=target_columns_to_types,
            table_description=table_description,
            column_descriptions=column_descriptions,
            track_rows_processed=track_rows_processed,
            **kwargs,
        )
        # Re-derive the table expression from the original argument for the insert below.
        table_name = (
            table_name_or_schema.this
            if isinstance(table_name_or_schema, exp.Schema)
            else exp.to_table(table_name_or_schema)
        )
        if do_dummy_insert:
            # Performing a dummy insert to create a dummy snapshot for Iceberg tables
            # to workaround https://github.com/apache/iceberg/issues/8849.
            dummy_insert = exp.insert(exp.select("*").from_(table_name), table_name)
            self.execute(dummy_insert)
491
492    def wap_supported(self, table_name: TableName) -> bool:
493        fqn = self._ensure_fqn(table_name)
494        return (
495            self.spark.conf.get(f"spark.sql.catalog.{fqn.catalog}")
496            == "org.apache.iceberg.spark.SparkCatalog"
497        )
498
499    def wap_table_name(self, table_name: TableName, wap_id: str) -> str:
500        branch_name = self._wap_branch_name(wap_id)
501        fqn = self._ensure_fqn(table_name)
502        return exp.Dot.build([fqn, exp.to_identifier(f"{self.BRANCH_PREFIX}{branch_name}")]).sql(
503            dialect=self.dialect
504        )
505
506    def wap_prepare(self, table_name: TableName, wap_id: str) -> str:
507        branch_name = self._wap_branch_name(wap_id)
508        fqn = self._ensure_fqn(table_name)
509        self.execute(f"ALTER TABLE {fqn.sql(dialect=self.dialect)} CREATE BRANCH {branch_name}")
510        return self.wap_table_name(table_name, wap_id)
511
512    def wap_publish(self, table_name: TableName, wap_id: str) -> None:
513        branch_name = self._wap_branch_name(wap_id)
514        fqn = self._ensure_fqn(table_name)
515
516        get_snapshot_id_query = (
517            exp.select("snapshot_id")
518            .from_(exp.Dot.build([fqn, exp.to_identifier("refs")]))
519            .where(exp.column("name").eq(branch_name))
520        )
521        iceberg_snapshot_ids = self.fetchall(get_snapshot_id_query)
522        if not iceberg_snapshot_ids:
523            raise SQLMeshError(f"Could not find Iceberg branch '{branch_name}'.")
524        iceberg_snapshot_id = iceberg_snapshot_ids[0][0]
525
526        logger.info(
527            "Cherry-picking Iceberg snapshot %s into table '%s'...", iceberg_snapshot_id, fqn
528        )
529
530        self.execute(
531            f"CALL {fqn.catalog}.system.cherrypick_snapshot('{fqn.db}.{fqn.name}', {iceberg_snapshot_id})"
532        )
533        self.execute(f"ALTER TABLE {fqn.sql(dialect=self.dialect)} DROP BRANCH {branch_name}")
534
535    def _ensure_fqn(self, table_name: TableName) -> exp.Table:
536        if isinstance(table_name, exp.Table):
537            table_name = table_name.copy()
538        table = exp.to_table(table_name, dialect=self.dialect)
539        if not table.catalog:
540            table.set("catalog", self.get_current_catalog())
541        if not table.db:
542            table.set("db", self._get_current_schema())
543        return table
544
545    def _build_create_comment_column_exp(
546        self, table: exp.Table, column_name: str, column_comment: str, table_kind: str = "TABLE"
547    ) -> exp.Comment | str:
548        table_sql = table.sql(dialect=self.dialect, identify=True)
549        column_sql = exp.column(column_name).sql(dialect=self.dialect, identify=True)
550
551        truncated_comment = self._truncate_column_comment(column_comment)
552        comment_sql = exp.Literal.string(truncated_comment).sql(dialect=self.dialect)
553
554        return f"ALTER TABLE {table_sql} ALTER COLUMN {column_sql} COMMENT {comment_sql}"
555
556    @classmethod
557    def _wap_branch_name(cls, wap_id: str) -> str:
558        return f"{cls.WAP_PREFIX}{wap_id}"
logger = <Logger sqlmesh.core.engine_adapter.spark (WARNING)>
sqlmesh.core.engine_adapter.spark

Arguments:

Arguments:

Arguments:

Returns:

Arguments:

Returns:

Arguments:

Inherited Members