Edit on GitHub
sqlmesh.core.engine_adapter.databricks

View Source
  1from __future__ import annotations
  2
  3import logging
  4import typing as t
  5from functools import partial
  6
  7from sqlglot import exp
  8
  9from sqlmesh.core.constants import LIQUID_CLUSTERING_KEYWORDS
 10from sqlmesh.core.dialect import to_schema
 11from sqlmesh.core.engine_adapter.mixins import GrantsFromInfoSchemaMixin
 12from sqlmesh.core.engine_adapter.shared import (
 13    CatalogSupport,
 14    DataObject,
 15    DataObjectType,
 16    InsertOverwriteStrategy,
 17    SourceQuery,
 18)
 19from sqlmesh.core.engine_adapter.spark import SparkEngineAdapter
 20from sqlmesh.core.node import IntervalUnit
 21from sqlmesh.core.schema_diff import NestedSupport
 22from sqlmesh.engines.spark.db_api.spark_session import connection, SparkSessionConnection
 23from sqlmesh.utils.errors import SQLMeshError, MissingDefaultCatalogError
 24
 25if t.TYPE_CHECKING:
 26    import pandas as pd
 27
 28    from sqlmesh.core._typing import SchemaName, TableName, SessionProperties
 29    from sqlmesh.core.engine_adapter._typing import DF, PySparkSession, Query
 30
 31logger = logging.getLogger(__name__)
 32
 33
 34def _query_tags(
 35    query_tags: t.Optional[t.Union[exp.Expr, str, int, float, bool]],
 36) -> t.Optional[t.Dict[str, t.Optional[str]]]:
 37    if not query_tags:
 38        return None
 39
 40    if not isinstance(query_tags, (exp.Map, exp.VarMap)):
 41        raise SQLMeshError("Invalid value for `session_properties.query_tags`. Must be a map.")
 42
 43    keys = query_tags.args.get("keys")
 44    values = query_tags.args.get("values")
 45    if not isinstance(keys, exp.Array) or not isinstance(values, exp.Array):
 46        raise SQLMeshError(
 47            "Invalid value for `session_properties.query_tags`. Must be a map with array "
 48            "keys and array values."
 49        )
 50
 51    tags: t.Dict[str, t.Optional[str]] = {}
 52    for key, value in zip(keys.expressions, values.expressions):
 53        if not isinstance(key, exp.Literal) or not key.is_string:
 54            raise SQLMeshError(
 55                "Invalid key in `session_properties.query_tags`. Keys must be string literals."
 56            )
 57
 58        if isinstance(value, exp.Null):
 59            tags[key.this] = None
 60        elif isinstance(value, exp.Literal) and value.is_string:
 61            tags[key.this] = value.this
 62        else:
 63            raise SQLMeshError(
 64                "Invalid value in `session_properties.query_tags`. Values must be string "
 65                "literals or NULL."
 66            )
 67
 68    return tags
 69
 70
 71class DatabricksEngineAdapter(SparkEngineAdapter, GrantsFromInfoSchemaMixin):
    # Dialect name; also used below when building the DECIMAL type key.
    DIALECT = "databricks"
    # Class-level capability/behaviour settings. They are consumed outside this block
    # (presumably by the base adapter classes) — confirm exact semantics there.
    INSERT_OVERWRITE_STRATEGY = InsertOverwriteStrategy.REPLACE_WHERE
    SUPPORTS_CLONING = True
    SUPPORTS_MATERIALIZED_VIEWS = True
    SUPPORTS_MATERIALIZED_VIEW_SCHEMA = True
    SUPPORTS_GRANTS = True
    USE_CATALOG_IN_GRANTS = True
    # Spark has this set to false for compatibility when mixing with Trino but that isn't a concern with Databricks
    QUOTE_IDENTIFIERS_IN_VIEWS = True
    # Options for schema diffing; the DECIMAL entry is keyed by the type built for this dialect.
    SCHEMA_DIFFER_KWARGS = {
        "support_positional_add": True,
        "nested_support": NestedSupport.ALL,
        "array_element_selector": "element",
        "parameterized_type_defaults": {
            exp.DataType.build("DECIMAL", dialect=DIALECT).this: [(10, 0), (0,)],
        },
    }
 89
    def __init__(self, *args: t.Any, **kwargs: t.Any) -> None:
        """Initialize the base adapter, then set up the secondary Spark adapter if needed.

        All positional and keyword arguments are forwarded unchanged to the parent
        initializer.
        """
        super().__init__(*args, **kwargs)
        # Must run after the parent initializer: it reads configuration set up there.
        self._set_spark_engine_adapter_if_needed()
 93
 94    @classmethod
 95    def can_access_spark_session(cls, disable_spark_session: bool) -> bool:
 96        from sqlmesh import RuntimeEnv
 97
 98        if disable_spark_session:
 99            return False
100
101        return RuntimeEnv.get().is_databricks
102
103    @classmethod
104    def can_access_databricks_connect(cls, disable_databricks_connect: bool) -> bool:
105        if disable_databricks_connect:
106            return False
107
108        try:
109            from databricks.connect import DatabricksSession  # noqa
110
111            return True
112        except ImportError:
113            return False
114
115    @property
116    def _use_spark_session(self) -> bool:
117        if self.can_access_spark_session(bool(self._extra_config.get("disable_spark_session"))):
118            return True
119
120        if self.can_access_databricks_connect(
121            bool(self._extra_config.get("disable_databricks_connect"))
122        ):
123            if self._extra_config.get("databricks_connect_use_serverless"):
124                return True
125
126            if {
127                "databricks_connect_cluster_id",
128                "databricks_connect_server_hostname",
129                "databricks_connect_access_token",
130            }.issubset(self._extra_config):
131                return True
132
133        return False
134
    @property
    def is_spark_session_connection(self) -> bool:
        """Whether the adapter's connection is a `SparkSessionConnection`."""
        return isinstance(self.connection, SparkSessionConnection)
138
139    @property
140    def _is_databricks_sql_connector_connection(self) -> bool:
141        return not self.is_spark_session_connection and not self._connection_pool.get_attribute(
142            "use_spark_engine_adapter"
143        )
144
145    def _set_spark_engine_adapter_if_needed(self) -> None:
146        self._spark_engine_adapter = None
147
148        if not self._use_spark_session or self.is_spark_session_connection:
149            return
150
151        from databricks.connect import DatabricksSession
152
153        connect_kwargs = dict(
154            host=self._extra_config["databricks_connect_server_hostname"],
155            token=self._extra_config.get("databricks_connect_access_token"),
156        )
157        if self._extra_config.get("databricks_connect_use_serverless"):
158            connect_kwargs["serverless"] = True
159        else:
160            connect_kwargs["cluster_id"] = self._extra_config["databricks_connect_cluster_id"]
161
162        catalog = self._extra_config.get("catalog")
163        spark = (
164            DatabricksSession.builder.remote(**connect_kwargs).userAgent("sqlmesh").getOrCreate()
165        )
166        self._spark_engine_adapter = SparkEngineAdapter(
167            partial(connection, spark=spark, catalog=catalog),
168            default_catalog=catalog,
169            execute_log_level=self._execute_log_level,
170            multithreaded=self._multithreaded,
171            sql_gen_kwargs=self._sql_gen_kwargs,
172            register_comments=self._register_comments,
173            pre_ping=self._pre_ping,
174            pretty_sql=self._pretty_sql,
175        )
176
177    @property
178    def cursor(self) -> t.Any:
179        if (
180            self._connection_pool.get_attribute("use_spark_engine_adapter")
181            and not self.is_spark_session_connection
182        ):
183            return self._spark_engine_adapter.cursor  # type: ignore
184        return super().cursor
185
186    @property
187    def spark(self) -> PySparkSession:
188        if not self._use_spark_session:
189            raise SQLMeshError(
190                "SparkSession is not available. "
191                "Either run from a Databricks Notebook or "
192                "install `databricks-connect` and configure it to connect to your Databricks cluster."
193            )
194        if self.is_spark_session_connection:
195            return self.connection.spark
196        return self._spark_engine_adapter.spark  # type: ignore
197
    @property
    def catalog_support(self) -> CatalogSupport:
        """Always reports `CatalogSupport.FULL_SUPPORT` for this adapter."""
        return CatalogSupport.FULL_SUPPORT
201
202    @staticmethod
203    def _grant_object_kind(table_type: DataObjectType) -> str:
204        if table_type == DataObjectType.VIEW:
205            return "VIEW"
206        if table_type == DataObjectType.MATERIALIZED_VIEW:
207            return "MATERIALIZED VIEW"
208        return "TABLE"
209
210    def _get_grant_expression(self, table: exp.Table) -> exp.Expr:
211        # We only care about explicitly granted privileges and not inherited ones
212        # if this is removed you would see grants inherited from the catalog get returned
213        expression = super()._get_grant_expression(table)
214        expression.args["where"].set(
215            "this",
216            exp.and_(
217                expression.args["where"].this,
218                exp.column("inherited_from").eq(exp.Literal.string("NONE")),
219                wrap=False,
220            ),
221        )
222        return expression
223
224    def _begin_session(self, properties: SessionProperties) -> t.Any:
225        """Begin a new session."""
226        # Align the different possible connectors to a single catalog
227        self.set_current_catalog(self.default_catalog)  # type: ignore
228        self._connection_pool.set_attribute("query_tags", _query_tags(properties.get("query_tags")))
229
230    def _end_session(self) -> None:
231        self._connection_pool.set_attribute("query_tags", None)
232        self._connection_pool.set_attribute("use_spark_engine_adapter", False)
233
234    def _execute(self, sql: str, track_rows_processed: bool = False, **kwargs: t.Any) -> None:
235        query_tags = self._connection_pool.get_attribute("query_tags")
236        if (
237            query_tags
238            and "query_tags" not in kwargs
239            and self._is_databricks_sql_connector_connection
240        ):
241            kwargs["query_tags"] = query_tags
242
243        return super()._execute(sql, track_rows_processed, **kwargs)
244
245    def _df_to_source_queries(
246        self,
247        df: DF,
248        target_columns_to_types: t.Dict[str, exp.DataType],
249        batch_size: int,
250        target_table: TableName,
251        source_columns: t.Optional[t.List[str]] = None,
252    ) -> t.List[SourceQuery]:
253        if not self._use_spark_session:
254            return super(SparkEngineAdapter, self)._df_to_source_queries(
255                df, target_columns_to_types, batch_size, target_table, source_columns=source_columns
256            )
257        pyspark_df = self._ensure_pyspark_df(
258            df, target_columns_to_types, source_columns=source_columns
259        )
260
261        def query_factory() -> Query:
262            temp_table = self._get_temp_table(target_table or "spark", table_only=True)
263            pyspark_df.createOrReplaceTempView(temp_table.sql(dialect=self.dialect))
264            self._connection_pool.set_attribute("use_spark_engine_adapter", True)
265            return exp.select(*self._select_columns(target_columns_to_types)).from_(temp_table)
266
267        return [SourceQuery(query_factory=query_factory)]
268
269    def _fetch_native_df(
270        self, query: t.Union[exp.Expr, str], quote_identifiers: bool = False
271    ) -> DF:
272        """Fetches a DataFrame that can be either Pandas or PySpark from the cursor"""
273        if self.is_spark_session_connection:
274            return super()._fetch_native_df(query, quote_identifiers=quote_identifiers)
275        if self._spark_engine_adapter:
276            return self._spark_engine_adapter._fetch_native_df(  # type: ignore
277                query, quote_identifiers=quote_identifiers
278            )
279        self.execute(query)
280        return self.cursor.fetchall_arrow().to_pandas()
281
282    def fetchdf(
283        self, query: t.Union[exp.Expr, str], quote_identifiers: bool = False
284    ) -> pd.DataFrame:
285        """
286        Returns a Pandas DataFrame from a query or expression.
287        """
288        import pandas as pd
289
290        df = self._fetch_native_df(query, quote_identifiers=quote_identifiers)
291        if not isinstance(df, pd.DataFrame):
292            return df.toPandas()
293        return df
294
295    def get_current_catalog(self) -> t.Optional[str]:
296        pyspark_catalog = None
297        sql_connector_catalog = None
298        if self._spark_engine_adapter:
299            from py4j.protocol import Py4JError
300            from pyspark.errors.exceptions.connect import SparkConnectGrpcException
301
302            try:
303                # Note: Spark 3.4+ Only API
304                pyspark_catalog = self._spark_engine_adapter.get_current_catalog()
305            except (Py4JError, SparkConnectGrpcException):
306                pass
307        elif self.is_spark_session_connection:
308            pyspark_catalog = self.connection.spark.catalog.currentCatalog()
309        if not self.is_spark_session_connection:
310            result = self.fetchone(exp.select(self.CURRENT_CATALOG_EXPRESSION))
311            sql_connector_catalog = result[0] if result else None
312        if self._spark_engine_adapter and pyspark_catalog != sql_connector_catalog:
313            logger.warning(
314                f"Current catalog mismatch between Databricks SQL Connector and Databricks-Connect: `{sql_connector_catalog}` != `{pyspark_catalog}`. Set `catalog` connection property to make them the same."
315            )
316        return pyspark_catalog or sql_connector_catalog
317
318    def set_current_catalog(self, catalog_name: str) -> None:
319        def _set_spark_session_current_catalog(spark: PySparkSession) -> None:
320            from py4j.protocol import Py4JError
321            from pyspark.errors.exceptions.connect import SparkConnectGrpcException
322
323            try:
324                # Note: Spark 3.4+ Only API
325                spark.catalog.setCurrentCatalog(catalog_name)
326            except (Py4JError, SparkConnectGrpcException):
327                pass
328
329        # Since Databricks splits commands across the Dataframe API and the SQL Connector
330        # (depending if databricks-connect is installed and a Dataframe is used) we need to ensure both
331        # are set to the same catalog since they maintain their default catalog separately
332        self.execute(exp.Use(this=exp.to_identifier(catalog_name), kind="CATALOG"))
333        if self.is_spark_session_connection:
334            _set_spark_session_current_catalog(self.connection.spark)
335
336        if self._spark_engine_adapter:
337            _set_spark_session_current_catalog(self._spark_engine_adapter.spark)
338
339    def _get_data_objects(
340        self, schema_name: SchemaName, object_names: t.Optional[t.Set[str]] = None
341    ) -> t.List[DataObject]:
342        """
343        Returns all the data objects that exist in the given schema and catalog.
344        """
345        schema = to_schema(schema_name)
346        catalog_name = schema.catalog or self.get_current_catalog()
347        query = (
348            exp.select(
349                exp.column("table_name").as_("name"),
350                exp.column("table_schema").as_("schema"),
351                exp.column("table_catalog").as_("catalog"),
352                exp.case(exp.column("table_type"))
353                .when(exp.Literal.string("VIEW"), exp.Literal.string("view"))
354                .when(
355                    exp.Literal.string("MATERIALIZED_VIEW"), exp.Literal.string("materialized_view")
356                )
357                .else_(exp.Literal.string("table"))
358                .as_("type"),
359            )
360            .from_(
361                # always query `system` information_schema
362                exp.table_("tables", "information_schema", "system")
363            )
364            .where(exp.column("table_catalog").eq(catalog_name))
365            .where(exp.column("table_schema").eq(schema.db))
366        )
367
368        if object_names:
369            query = query.where(exp.column("table_name").isin(*object_names))
370
371        df = self.fetchdf(query)
372        return [
373            DataObject(
374                catalog=row.catalog,  # type: ignore
375                schema=row.schema,  # type: ignore
376                name=row.name,  # type: ignore
377                type=DataObjectType.from_str(row.type),  # type: ignore
378            )
379            for row in df.itertuples()
380        ]
381
382    def clone_table(
383        self,
384        target_table_name: TableName,
385        source_table_name: TableName,
386        replace: bool = False,
387        exists: bool = True,
388        clone_kwargs: t.Optional[t.Dict[str, t.Any]] = None,
389        **kwargs: t.Any,
390    ) -> None:
391        clone_kwargs = clone_kwargs or {}
392        clone_kwargs["shallow"] = True
393        super().clone_table(
394            target_table_name,
395            source_table_name,
396            replace=replace,
397            clone_kwargs=clone_kwargs,
398            **kwargs,
399        )
400
    def wap_supported(self, table_name: TableName) -> bool:
        """Always returns False for this adapter; `table_name` is not inspected."""
        return False
403
404    def close(self) -> t.Any:
405        """Closes all open connections and releases all allocated resources."""
406        super().close()
407        if self._spark_engine_adapter:
408            self._spark_engine_adapter.close()
409
410    @property
411    def default_catalog(self) -> t.Optional[str]:
412        try:
413            return super().default_catalog
414        except MissingDefaultCatalogError as e:
415            raise MissingDefaultCatalogError(
416                "Could not determine default catalog. Define the connection property `catalog` since it can't be inferred from your connection. See SQLMesh Databricks documentation for details"
417            ) from e
418
419    def _build_table_properties_exp(
420        self,
421        catalog_name: t.Optional[str] = None,
422        table_format: t.Optional[str] = None,
423        storage_format: t.Optional[str] = None,
424        partitioned_by: t.Optional[t.List[exp.Expr]] = None,
425        partition_interval_unit: t.Optional[IntervalUnit] = None,
426        clustered_by: t.Optional[t.List[exp.Expr]] = None,
427        table_properties: t.Optional[t.Dict[str, exp.Expr]] = None,
428        target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
429        table_description: t.Optional[str] = None,
430        table_kind: t.Optional[str] = None,
431        **kwargs: t.Any,
432    ) -> t.Optional[exp.Properties]:
433        properties = super()._build_table_properties_exp(
434            catalog_name=catalog_name,
435            table_format=table_format,
436            storage_format=storage_format,
437            partitioned_by=partitioned_by,
438            partition_interval_unit=partition_interval_unit,
439            clustered_by=clustered_by,
440            table_properties=table_properties,
441            target_columns_to_types=target_columns_to_types,
442            table_description=table_description,
443            table_kind=table_kind,
444        )
445        if clustered_by:
446            if len(clustered_by) == 1 and isinstance(clustered_by[0], exp.Var):
447                if clustered_by[0].name.upper() not in LIQUID_CLUSTERING_KEYWORDS:
448                    raise ValueError(f"Unexpected bare Var in clustered_by: {clustered_by[0]!r}")
449                # exp.Cluster with a bare Var generates: CLUSTER BY AUTO (no parens)
450                clustered_by_exp = exp.Cluster(expressions=[clustered_by[0].copy()])
451            else:
452                # Databricks expects column expressions wrapped in a tuple
453                clustered_by_exp = exp.Cluster(
454                    expressions=[exp.Tuple(expressions=[c.copy() for c in clustered_by])]
455                )
456            expressions = properties.expressions if properties else []
457            expressions.append(clustered_by_exp)
458            properties = exp.Properties(expressions=expressions)
459        return properties
460
461    def _build_column_defs(
462        self,
463        target_columns_to_types: t.Dict[str, exp.DataType],
464        column_descriptions: t.Optional[t.Dict[str, str]] = None,
465        is_view: bool = False,
466        materialized: bool = False,
467    ) -> t.List[exp.ColumnDef]:
468        # Databricks requires column types to be specified when adding column comments
469        # in CREATE MATERIALIZED VIEW statements. Override is_view to False to force
470        # column types to be included when comments are present.
471        if is_view and materialized and column_descriptions:
472            is_view = False
473
474        return super()._build_column_defs(
475            target_columns_to_types, column_descriptions, is_view, materialized
476        )
477
478    def columns(
479        self, table_name: TableName, include_pseudo_columns: bool = False
480    ) -> t.Dict[str, exp.DataType]:
481        table = exp.to_table(table_name)
482
483        column_catalog = table.catalog or self.get_current_catalog()
484        query = (
485            exp.select("columns.column_name", "columns.full_data_type")
486            .from_("system.information_schema.columns")
487            .where(
488                exp.and_(
489                    exp.column("table_name").eq(table.name),
490                    exp.column("table_schema").eq(table.db),
491                    exp.column("table_catalog").eq(column_catalog),
492                )
493            )
494            .order_by("ordinal_position ASC")
495        )
496
497        self.execute(query.sql(dialect=self.dialect))
498        result = self.cursor.fetchall()
499
500        return {row[0]: exp.DataType.build(row[1], dialect=self.dialect) for row in result}
logger = <Logger sqlmesh.core.engine_adapter.databricks (WARNING)>
sqlmesh.core.engine_adapter.databricks

Arguments:

Arguments:

Inherited Members