Edit on GitHub

sqlmesh.core.engine_adapter.bigquery

View Source

   1from __future__ import annotations
   2
   3import logging
   4import typing as t
   5from collections import defaultdict
   6
   7from sqlglot import exp, parse_one
   8from sqlglot.transforms import remove_precision_parameterized_types
   9
  10from sqlmesh.core.dialect import to_schema
  11from sqlmesh.core.engine_adapter.base import _get_data_object_cache_key
  12from sqlmesh.core.engine_adapter.mixins import (
  13    ClusteredByMixin,
  14    GrantsFromInfoSchemaMixin,
  15    RowDiffMixin,
  16    TableAlterClusterByOperation,
  17)
  18from sqlmesh.core.engine_adapter.shared import (
  19    CatalogSupport,
  20    DataObject,
  21    DataObjectType,
  22    SourceQuery,
  23    set_catalog,
  24    InsertOverwriteStrategy,
  25)
  26from sqlmesh.core.node import IntervalUnit
  27from sqlmesh.core.schema_diff import TableAlterOperation, NestedSupport
  28from sqlmesh.utils import optional_import, get_source_columns_to_types
  29from sqlmesh.utils.date import to_datetime
  30from sqlmesh.utils.errors import SQLMeshError
  31from sqlmesh.utils.pandas import columns_to_types_from_dtypes
  32
  33if t.TYPE_CHECKING:
  34    import pandas as pd
  35    from google.api_core.retry import Retry
  36    from google.cloud import bigquery
  37    from google.cloud.bigquery import StandardSqlDataType
  38    from google.cloud.bigquery.client import Client as BigQueryClient
  39    from google.cloud.bigquery.job import QueryJob
  40    from google.cloud.bigquery.job.base import _AsyncJob as BigQueryQueryResult
  41    from google.cloud.bigquery.table import Table as BigQueryTable
  42
  43    from sqlmesh.core._typing import SchemaName, SessionProperties, TableName
  44    from sqlmesh.core.engine_adapter._typing import BigframeSession, DCL, DF, GrantsConfig, Query
  45    from sqlmesh.core.engine_adapter.base import QueryOrDF
  46
  47
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Optional dependencies. Code in this module tests these names for truthiness
# before using them, so they are presumably falsy when the package is not
# installed — confirm against `optional_import`.
bigframes = optional_import("bigframes")
bigframes_pd = optional_import("bigframes.pandas")


# A nested field to add, as (new field name, data type SQL, remaining path of
# parent field names below the root column).
NestedField = t.Tuple[str, str, t.List[str]]
# Maps a root column name to the nested fields that should be added beneath it.
NestedFieldsDict = t.Dict[str, t.List[NestedField]]
  56
  57
  58@set_catalog()
  59class BigQueryEngineAdapter(ClusteredByMixin, RowDiffMixin, GrantsFromInfoSchemaMixin):
  60    """
  61    BigQuery Engine Adapter using the `google-cloud-bigquery` library's DB API.
  62    """
  63
    # SQL dialect name; also used below when building the data types.
    DIALECT = "bigquery"
    DEFAULT_BATCH_SIZE = 1000
    # Capability flags and limits. They are not read in the visible part of this
    # class; presumably the base adapter and mixins consult them — confirm there.
    SUPPORTS_TRANSACTIONS = False
    SUPPORTS_MATERIALIZED_VIEWS = True
    SUPPORTS_CLONING = True
    SUPPORTS_GRANTS = True
    CURRENT_USER_OR_ROLE_EXPRESSION: exp.Expr = exp.func("session_user")
    SUPPORTS_MULTIPLE_GRANT_PRINCIPALS = True
    USE_CATALOG_IN_GRANTS = True
    GRANT_INFORMATION_SCHEMA_TABLE_NAME = "OBJECT_PRIVILEGES"
    MAX_TABLE_COMMENT_LENGTH = 1024
    MAX_COLUMN_COMMENT_LENGTH = 1024
    SUPPORTS_QUERY_EXECUTION_TRACKING = True
    SUPPORTED_DROP_CASCADE_OBJECT_KINDS = ["SCHEMA"]
    INSERT_OVERWRITE_STRATEGY = InsertOverwriteStrategy.MERGE

    # Type-change rules for schema diffing, keyed by option name. Presumably
    # forwarded as keyword arguments to the schema differ — TODO confirm.
    SCHEMA_DIFFER_KWARGS = {
        # Each key type maps to the set of types it is listed as compatible with.
        "compatible_types": {
            exp.DataType.build("INT64", dialect=DIALECT): {
                exp.DataType.build("NUMERIC", dialect=DIALECT),
                exp.DataType.build("FLOAT64", dialect=DIALECT),
                exp.DataType.build("BIGNUMERIC", dialect=DIALECT),
            },
            exp.DataType.build("NUMERIC", dialect=DIALECT): {
                exp.DataType.build("FLOAT64", dialect=DIALECT),
                exp.DataType.build("BIGNUMERIC", dialect=DIALECT),
            },
            exp.DataType.build("DATE", dialect=DIALECT): {
                exp.DataType.build("DATETIME", dialect=DIALECT),
            },
        },
        "coerceable_types": {
            exp.DataType.build("FLOAT64", dialect=DIALECT): {
                exp.DataType.build("BIGNUMERIC", dialect=DIALECT),
            },
        },
        "support_coercing_compatible_types": True,
        # NOTE(review): the BIGDECIMAL precision is the float 76.76, not an int;
        # this looks intentional but confirm against the BigQuery type limits.
        "parameterized_type_defaults": {
            exp.DataType.build("DECIMAL", dialect=DIALECT).this: [(38, 9), (0,)],
            exp.DataType.build("BIGDECIMAL", dialect=DIALECT).this: [(76.76, 38), (0,)],
        },
        "types_with_unlimited_length": {
            # parameterized `STRING(n)` can ALTER to unparameterized `STRING`
            exp.DataType.build("STRING", dialect=DIALECT).this: {
                exp.DataType.build("STRING", dialect=DIALECT).this,
            },
            # parameterized `BYTES(n)` can ALTER to unparameterized `BYTES`
            exp.DataType.build("BYTES", dialect=DIALECT).this: {
                exp.DataType.build("BYTES", dialect=DIALECT).this,
            },
        },
        "nested_support": NestedSupport.ALL_BUT_DROP,
    }
 117
 118    @property
 119    def client(self) -> BigQueryClient:
 120        return self.connection._client
 121
 122    @property
 123    def bigframe(self) -> t.Optional[BigframeSession]:
 124        if bigframes:
 125            options = bigframes.BigQueryOptions(
 126                credentials=self.client._credentials,
 127                project=self.client.project,
 128                location=self.client.location,
 129            )
 130            return bigframes.connect(context=options)
 131        return None
 132
 133    @property
 134    def _job_params(self) -> t.Dict[str, t.Any]:
 135        from sqlmesh.core.config.connection import BigQueryPriority
 136
 137        params = {
 138            "use_legacy_sql": False,
 139            "priority": self._extra_config.get(
 140                "priority", BigQueryPriority.INTERACTIVE.bigquery_constant
 141            ),
 142        }
 143        if self._extra_config.get("maximum_bytes_billed") is not None:
 144            params["maximum_bytes_billed"] = self._extra_config.get("maximum_bytes_billed")
 145        if self._extra_config.get("reservation") is not None:
 146            params["reservation"] = self._extra_config.get("reservation")
 147        if self.correlation_id:
 148            # BigQuery label keys must be lowercase
 149            key = self.correlation_id.job_type.value.lower()
 150            params["labels"] = {key: self.correlation_id.job_id}
 151        return params
 152
 153    @property
 154    def catalog_support(self) -> CatalogSupport:
 155        return CatalogSupport.FULL_SUPPORT
 156
 157    def _df_to_source_queries(
 158        self,
 159        df: DF,
 160        target_columns_to_types: t.Dict[str, exp.DataType],
 161        batch_size: int,
 162        target_table: TableName,
 163        source_columns: t.Optional[t.List[str]] = None,
 164    ) -> t.List[SourceQuery]:
 165        import pandas as pd
 166
 167        source_columns_to_types = get_source_columns_to_types(
 168            target_columns_to_types, source_columns
 169        )
 170
 171        temp_bq_table = self.__get_temp_bq_table(
 172            self._get_temp_table(target_table or "pandas"), source_columns_to_types
 173        )
 174        temp_table = exp.table_(
 175            temp_bq_table.table_id,
 176            db=temp_bq_table.dataset_id,
 177            catalog=temp_bq_table.project,
 178        )
 179
 180        def query_factory() -> Query:
 181            ordered_df = df[list(source_columns_to_types)]
 182            if bigframes_pd and isinstance(ordered_df, bigframes_pd.DataFrame):
 183                ordered_df.to_gbq(
 184                    f"{temp_bq_table.project}.{temp_bq_table.dataset_id}.{temp_bq_table.table_id}",
 185                    if_exists="replace",
 186                )
 187            elif not self.table_exists(temp_table):
 188                # Make mypy happy
 189                assert isinstance(ordered_df, pd.DataFrame)
 190                self._db_call(self.client.create_table, table=temp_bq_table, exists_ok=False)
 191                result = self.__load_pandas_to_table(
 192                    temp_bq_table, ordered_df, source_columns_to_types, replace=False
 193                )
 194                if result.errors:
 195                    raise SQLMeshError(result.errors)
 196            return exp.select(
 197                *self._casted_columns(target_columns_to_types, source_columns=source_columns)
 198            ).from_(temp_table)
 199
 200        return [
 201            SourceQuery(
 202                query_factory=query_factory,
 203                cleanup_func=lambda: self.drop_table(temp_table),
 204            )
 205        ]
 206
 207    def close(self) -> t.Any:
 208        # Cancel all pending query jobs across all threads
 209        all_query_jobs = self._connection_pool.get_all_attributes("query_job")
 210        for query_job in all_query_jobs:
 211            if query_job:
 212                try:
 213                    if not self._db_call(query_job.done):
 214                        self._db_call(query_job.cancel)
 215                        logger.debug(
 216                            "Cancelled BigQuery job: https://console.cloud.google.com/bigquery?project=%s&j=bq:%s:%s",
 217                            query_job.project,
 218                            query_job.location,
 219                            query_job.job_id,
 220                        )
 221                except Exception as ex:
 222                    logger.debug(
 223                        "Failed to cancel BigQuery job: https://console.cloud.google.com/bigquery?project=%s&j=bq:%s:%s. %s",
 224                        query_job.project,
 225                        query_job.location,
 226                        query_job.job_id,
 227                        str(ex),
 228                    )
 229
 230        return super().close()
 231
 232    def _begin_session(self, properties: SessionProperties) -> None:
 233        from google.cloud.bigquery import QueryJobConfig
 234
 235        query_label_property = properties.get("query_label")
 236        parsed_query_label: list[tuple[str, str]] = []
 237        if isinstance(query_label_property, (exp.Array, exp.Paren, exp.Tuple)):
 238            label_tuples = (
 239                [query_label_property.unnest()]
 240                if isinstance(query_label_property, exp.Paren)
 241                else query_label_property.expressions
 242            )
 243
 244            # query_label is a Paren, Array or Tuple of 2-tuples and validated at load time
 245            parsed_query_label.extend(
 246                (label_tuple.expressions[0].name, label_tuple.expressions[1].name)
 247                for label_tuple in label_tuples
 248            )
 249        elif query_label_property is not None:
 250            raise SQLMeshError(
 251                "Invalid value for `session_properties.query_label`. Must be an array or tuple."
 252            )
 253
 254        if self.correlation_id:
 255            parsed_query_label.append(
 256                (self.correlation_id.job_type.value.lower(), self.correlation_id.job_id)
 257            )
 258
 259        if parsed_query_label:
 260            query_label_str = ",".join([":".join(label) for label in parsed_query_label])
 261            query = f'SET @@query_label = "{query_label_str}";SELECT 1;'
 262        else:
 263            query = "SELECT 1;"
 264
 265        job = self.client.query(
 266            query,
 267            job_config=QueryJobConfig(create_session=True),
 268        )
 269        session_info = job.session_info
 270        session_id = session_info.session_id if session_info else None
 271        self._session_id = session_id
 272        job.result()
 273
 274    def _end_session(self) -> None:
 275        self._session_id = None
 276
 277    def _is_session_active(self) -> bool:
 278        return self._session_id is not None
 279
 280    def get_current_catalog(self) -> t.Optional[str]:
 281        """Returns the catalog name of the current connection."""
 282        return self.client.project
 283
 284    def set_current_catalog(self, catalog: str) -> None:
 285        """Sets the catalog name of the current connection."""
 286        self.client.project = catalog
 287
 288    def create_schema(
 289        self,
 290        schema_name: SchemaName,
 291        ignore_if_exists: bool = True,
 292        warn_on_error: bool = True,
 293        properties: t.List[exp.Expr] = [],
 294    ) -> None:
 295        """Create a schema from a name or qualified table name."""
 296        from google.api_core.exceptions import Conflict
 297
 298        try:
 299            super().create_schema(
 300                schema_name,
 301                ignore_if_exists=ignore_if_exists,
 302                warn_on_error=False,
 303            )
 304        except Exception as e:
 305            is_already_exists_error = isinstance(e, Conflict) and "Already Exists:" in str(e)
 306            if is_already_exists_error and ignore_if_exists:
 307                return
 308            if not warn_on_error:
 309                raise
 310            logger.warning("Failed to create schema '%s': %s", schema_name, e)
 311
 312    def get_bq_schema(self, table_name: TableName) -> t.List[bigquery.SchemaField]:
 313        table = exp.to_table(table_name)
 314        if len(table.parts) == 3 and "." in table.name:
 315            self.execute(exp.select("*").from_(table).limit(0))
 316            query_job = self._query_job
 317            assert query_job is not None
 318            return query_job._query_results.schema
 319        return self._get_table(table).schema
 320
 321    def columns(
 322        self, table_name: TableName, include_pseudo_columns: bool = False
 323    ) -> t.Dict[str, exp.DataType]:
 324        """Fetches column names and types for the target table."""
 325
 326        def dtype_to_sql(
 327            dtype: t.Optional[StandardSqlDataType], field: bigquery.SchemaField
 328        ) -> str:
 329            assert dtype
 330            assert field
 331
 332            kind = dtype.type_kind
 333            assert kind
 334
 335            # Not using the enum value to preserve compatibility with older versions
 336            # of the BigQuery library.
 337            if kind.name == "ARRAY":
 338                return f"ARRAY<{dtype_to_sql(dtype.array_element_type, field)}>"
 339            if kind.name == "STRUCT":
 340                struct_type = dtype.struct_type
 341                assert struct_type
 342                fields = ", ".join(
 343                    f"{struct_field.name} {dtype_to_sql(struct_field.type, nested_field)}"
 344                    for struct_field, nested_field in zip(struct_type.fields, field.fields)
 345                )
 346                return f"STRUCT<{fields}>"
 347            if kind.name == "TYPE_KIND_UNSPECIFIED":
 348                field_type = field.field_type
 349
 350                if field_type == "RANGE":
 351                    # If the field is a RANGE then `range_element_type` should be set to
 352                    # one of `"DATE"`, `"DATETIME"` or `"TIMESTAMP"`.
 353                    return f"RANGE<{field.range_element_type.element_type}>"
 354
 355                return field_type
 356
 357            return kind.name
 358
 359        def create_mapping_schema(
 360            schema: t.Sequence[bigquery.SchemaField],
 361        ) -> t.Dict[str, exp.DataType]:
 362            return {
 363                field.name: exp.DataType.build(
 364                    dtype_to_sql(field.to_standard_sql().type, field), dialect=self.dialect
 365                )
 366                for field in schema
 367            }
 368
 369        table = exp.to_table(table_name)
 370        if len(table.parts) == 3 and "." in table.name:
 371            # The client's `get_table` method can't handle paths with >3 identifiers
 372            self.execute(exp.select("*").from_(table).limit(0))
 373            query_job = self._query_job
 374            assert query_job is not None
 375
 376            query_results = query_job._query_results
 377            columns = create_mapping_schema(query_results.schema)
 378        else:
 379            bq_table = self._get_table(table)
 380            columns = create_mapping_schema(bq_table.schema)
 381
 382            if include_pseudo_columns:
 383                if bq_table.time_partitioning and not bq_table.time_partitioning.field:
 384                    columns["_PARTITIONTIME"] = exp.DataType.build("TIMESTAMP", dialect="bigquery")
 385                    if bq_table.time_partitioning.type_ == "DAY":
 386                        columns["_PARTITIONDATE"] = exp.DataType.build("DATE")
 387                if bq_table.table_id.endswith("*"):
 388                    columns["_TABLE_SUFFIX"] = exp.DataType.build("STRING", dialect="bigquery")
 389                if (
 390                    bq_table.external_data_configuration is not None
 391                    and bq_table.external_data_configuration.source_format
 392                    in (
 393                        "CSV",
 394                        "NEWLINE_DELIMITED_JSON",
 395                        "AVRO",
 396                        "PARQUET",
 397                        "ORC",
 398                        "DATASTORE_BACKUP",
 399                    )
 400                ):
 401                    columns["_FILE_NAME"] = exp.DataType.build("STRING", dialect="bigquery")
 402
 403        return columns
 404
 405    def alter_table(
 406        self,
 407        alter_expressions: t.Union[t.List[exp.Alter], t.List[TableAlterOperation]],
 408    ) -> None:
 409        """
 410        Performs the alter statements to change the current table into the structure of the target table,
 411        and uses the API to add columns to structs, where SQL is not supported.
 412        """
 413        if not alter_expressions:
 414            return
 415
 416        cluster_by_operations, alter_statements = [], []
 417        for e in alter_expressions:
 418            if isinstance(e, TableAlterClusterByOperation):
 419                cluster_by_operations.append(e)
 420            elif isinstance(e, TableAlterOperation):
 421                alter_statements.append(e.expression)
 422            else:
 423                alter_statements.append(e)
 424
 425        for op in cluster_by_operations:
 426            self._update_clustering_key(op)
 427
 428        nested_fields, non_nested_expressions = self._split_alter_expressions(alter_statements)
 429
 430        if nested_fields:
 431            self._update_table_schema_nested_fields(nested_fields, alter_statements[0].this)
 432
 433        if non_nested_expressions:
 434            super().alter_table(non_nested_expressions)
 435
 436    def fetchone(
 437        self,
 438        query: t.Union[exp.Expr, str],
 439        ignore_unsupported_errors: bool = False,
 440        quote_identifiers: bool = False,
 441    ) -> t.Optional[t.Tuple]:
 442        """
 443        BigQuery's `fetchone` method doesn't call execute and therefore would not benefit from the execute
 444        configuration we have in place. Therefore this implementation calls execute instead.
 445        """
 446        self.execute(
 447            query,
 448            ignore_unsupported_errors=ignore_unsupported_errors,
 449            quote_identifiers=quote_identifiers,
 450        )
 451        try:
 452            return next(self._query_data)
 453        except StopIteration:
 454            return None
 455
 456    def fetchall(
 457        self,
 458        query: t.Union[exp.Expr, str],
 459        ignore_unsupported_errors: bool = False,
 460        quote_identifiers: bool = False,
 461    ) -> t.List[t.Tuple]:
 462        """
 463        BigQuery's `fetchone` method doesn't call execute and therefore would not benefit from the execute
 464        configuration we have in place. Therefore this implementation calls execute instead.
 465        """
 466        self.execute(
 467            query,
 468            ignore_unsupported_errors=ignore_unsupported_errors,
 469            quote_identifiers=quote_identifiers,
 470        )
 471        return list(self._query_data)
 472
 473    def _split_alter_expressions(
 474        self,
 475        alter_expressions: t.List[exp.Alter],
 476    ) -> t.Tuple[NestedFieldsDict, t.List[exp.Alter]]:
 477        """
 478        Returns a dictionary of the nested fields to add and a list of the non-nested alter expressions.
 479        """
 480        nested_fields_to_add: NestedFieldsDict = defaultdict(list)
 481        non_nested_expressions = []
 482
 483        for alter_expression in alter_expressions:
 484            action = alter_expression.args["actions"][0]
 485            if (
 486                isinstance(action, exp.ColumnDef)
 487                and isinstance(action.this, exp.Dot)
 488                and isinstance(action.kind, exp.DataType)
 489            ):
 490                root_field, *leaf_fields = action.this.this.sql(dialect=self.dialect).split(".")
 491                new_field = action.this.expression.sql(dialect=self.dialect)
 492                data_type = action.kind.sql(dialect=self.dialect)
 493                nested_fields_to_add[root_field].append((new_field, data_type, leaf_fields))
 494            else:
 495                non_nested_expressions.append(alter_expression)
 496
 497        return nested_fields_to_add, non_nested_expressions
 498
 499    def _build_nested_fields(
 500        self,
 501        current_fields: t.List[bigquery.SchemaField],
 502        fields_to_add: t.List[NestedField],
 503    ) -> t.List[bigquery.SchemaField]:
 504        """
 505        Recursively builds and updates the schema fields with the new nested fields.
 506        """
 507        from google.cloud import bigquery
 508
 509        new_fields = []
 510        root: t.List[t.Tuple[str, str]] = []
 511        leaves: NestedFieldsDict = defaultdict(list)
 512        for new_field, data_type, leaf_fields in fields_to_add:
 513            if leaf_fields:
 514                leaves[leaf_fields[0]].append((new_field, data_type, leaf_fields[1:]))
 515            else:
 516                root.append((new_field, data_type))
 517
 518        for field in current_fields:
 519            # If the new fields are nested, we need to recursively build them
 520            if field.name in leaves:
 521                subfields = list(field.fields)
 522                subfields = self._build_nested_fields(subfields, leaves[field.name])
 523                new_fields.append(
 524                    bigquery.SchemaField(
 525                        field.name, "RECORD", mode=field.mode, fields=tuple(subfields)
 526                    )
 527                )
 528            else:
 529                new_fields.append(field)
 530
 531        # Build and append the new root-level fields
 532        new_fields.extend(
 533            self.__get_bq_schemafield(
 534                new_field[0], exp.DataType.build(new_field[1], dialect=self.dialect)
 535            )
 536            for new_field in root
 537        )
 538        return new_fields
 539
 540    def _update_table_schema_nested_fields(
 541        self, nested_fields_to_add: NestedFieldsDict, table_name: str
 542    ) -> None:
 543        """
 544        Updates a BigQuery table schema by adding the new nested fields provided.
 545        """
 546        from google.cloud import bigquery
 547
 548        table = self._get_table(table_name)
 549        original_schema = table.schema
 550        new_schema = []
 551        for field in original_schema:
 552            if field.name in nested_fields_to_add:
 553                fields = self._build_nested_fields(
 554                    list(field.fields), nested_fields_to_add[field.name]
 555                )
 556                new_schema.append(
 557                    bigquery.SchemaField(
 558                        field.name,
 559                        "RECORD",
 560                        mode=field.mode,
 561                        fields=tuple(fields),
 562                    )
 563                )
 564            else:
 565                new_schema.append(field)
 566
 567        if new_schema != original_schema:
 568            table.schema = new_schema
 569            self.client.update_table(table, ["schema"])
 570
 571    def __load_pandas_to_table(
 572        self,
 573        table: bigquery.Table,
 574        df: pd.DataFrame,
 575        columns_to_types: t.Dict[str, exp.DataType],
 576        replace: bool = False,
 577    ) -> BigQueryQueryResult:
 578        """
 579        Loads a pandas dataframe into a table in BigQuery. Will do an overwrite if replace is True. Note that
 580        the replace will replace the entire table, not just the rows that are in the dataframe.
 581        """
 582        from google.cloud import bigquery
 583
 584        job_config = bigquery.job.LoadJobConfig(schema=self.__get_bq_schema(columns_to_types))
 585        if replace:
 586            job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
 587        logger.info(f"Loading dataframe to BigQuery. Table Path: {table.path}")
 588        # This client call does not support retry so we don't use the `_db_call` method.
 589        result = self.__retry(
 590            self.__db_load_table_from_dataframe,
 591        )(df=df, table=table, job_config=job_config)
 592        if result.errors:
 593            raise SQLMeshError(result.errors)
 594        return result
 595
 596    def __db_load_table_from_dataframe(
 597        self, df: pd.DataFrame, table: bigquery.Table, job_config: bigquery.LoadJobConfig
 598    ) -> BigQueryQueryResult:
 599        job = self.client.load_table_from_dataframe(
 600            dataframe=df, destination=table, job_config=job_config
 601        )
 602        return self._db_call(job.result)
 603
 604    def __get_bq_schemafield(self, name: str, tpe: exp.DataType) -> bigquery.SchemaField:
 605        from google.cloud import bigquery
 606
 607        mode = "NULLABLE"
 608        if tpe.is_type(exp.DataType.Type.ARRAY):
 609            mode = "REPEATED"
 610            tpe = tpe.expressions[0]
 611
 612        field_type = tpe.sql(dialect=self.dialect)
 613        fields = []
 614        if tpe.is_type(*exp.DataType.NESTED_TYPES):
 615            field_type = "RECORD"
 616            for inner_field in tpe.expressions:
 617                if isinstance(inner_field, exp.ColumnDef):
 618                    inner_name = inner_field.this.sql(dialect=self.dialect)
 619                    inner_type = inner_field.kind
 620                    if inner_type is None:
 621                        raise ValueError(
 622                            f"cannot convert unknown type to BQ schema field {inner_field}"
 623                        )
 624                    fields.append(self.__get_bq_schemafield(name=inner_name, tpe=inner_type))
 625                else:
 626                    raise ValueError(f"unexpected nested expression {inner_field}")
 627
 628        return bigquery.SchemaField(
 629            name=name,
 630            field_type=field_type,
 631            mode=mode,
 632            fields=fields,
 633        )
 634
 635    def __get_bq_schema(
 636        self, columns_to_types: t.Dict[str, exp.DataType]
 637    ) -> t.List[bigquery.SchemaField]:
 638        """
 639        Returns a bigquery schema object from a dictionary of column names to types.
 640        """
 641
 642        precisionless_col_to_types = {
 643            col_name: remove_precision_parameterized_types(col_type)
 644            for col_name, col_type in columns_to_types.items()
 645        }
 646        return [
 647            self.__get_bq_schemafield(name=col_name, tpe=t.cast(exp.DataType, col_type))
 648            for col_name, col_type in precisionless_col_to_types.items()
 649        ]
 650
 651    def __get_temp_bq_table(
 652        self, table: exp.Table, columns_to_type: t.Dict[str, exp.DataType]
 653    ) -> bigquery.Table:
 654        """
 655        Returns a bigquery table object that is temporary and will expire in 3 hours.
 656        """
 657        bq_table = self.__get_bq_table(table, columns_to_type)
 658        bq_table.expires = to_datetime("in 3 hours")
 659        return bq_table
 660
 661    def __get_bq_table(
 662        self, table: TableName, columns_to_type: t.Dict[str, exp.DataType]
 663    ) -> bigquery.Table:
 664        """
 665        Returns a bigquery table object with a schema defines that matches the columns_to_type dictionary.
 666        """
 667        from google.cloud import bigquery
 668
 669        table_ = exp.to_table(table).copy()
 670
 671        if not table_.catalog:
 672            table_.set("catalog", exp.to_identifier(self.default_catalog))
 673
 674        return bigquery.Table(
 675            table_ref=self._table_name(table_),
 676            schema=self.__get_bq_schema(columns_to_type),
 677        )
 678
 679    @property
 680    def __retry(self) -> Retry:
 681        from google.api_core import retry
 682
 683        return retry.Retry(
 684            predicate=_ErrorCounter(self._extra_config["job_retries"]).should_retry,
 685            deadline=self._extra_config.get("job_retry_deadline_seconds"),
 686            initial=1.0,
 687            maximum=3.0,
 688        )
 689
 690    def insert_overwrite_by_partition(
 691        self,
 692        table_name: TableName,
 693        query_or_df: QueryOrDF,
 694        partitioned_by: t.List[exp.Expr],
 695        target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
 696        source_columns: t.Optional[t.List[str]] = None,
 697    ) -> None:
 698        if len(partitioned_by) != 1:
 699            raise SQLMeshError(
 700                f"Bigquery only supports partitioning by one column, {len(partitioned_by)} were provided."
 701            )
 702
 703        partition_exp = partitioned_by[0]
 704        partition_column = partition_exp.find(exp.Column)
 705
 706        granularity = partition_exp.args.get("unit")
 707        if granularity:
 708            granularity = granularity.name.lower()
 709
 710        if not partition_column:
 711            partition_sql = partition_exp.sql(dialect=self.dialect)
 712            raise SQLMeshError(
 713                f"The partition expression '{partition_sql}' doesn't contain a column."
 714            )
 715        with (
 716            self.session({}),
 717            self.temp_table(
 718                query_or_df,
 719                name=table_name,
 720                partitioned_by=partitioned_by,
 721                source_columns=source_columns,
 722            ) as temp_table_name,
 723        ):
 724            if target_columns_to_types is None or target_columns_to_types[
 725                partition_column.name
 726            ] == exp.DataType.build("unknown"):
 727                target_columns_to_types = self.columns(table_name)
 728
 729            partition_type_sql = target_columns_to_types[partition_column.name].sql(
 730                dialect=self.dialect
 731            )
 732
 733            select_array_agg_partitions = select_partitions_expr(
 734                temp_table_name.db,
 735                temp_table_name.name,
 736                partition_type_sql,
 737                granularity=granularity,
 738                agg_func="ARRAY_AGG",
 739                catalog=temp_table_name.catalog or self.default_catalog,
 740            )
 741
 742            self.execute(
 743                f"DECLARE _sqlmesh_target_partitions_ ARRAY<{partition_type_sql}> DEFAULT ({select_array_agg_partitions});"
 744            )
 745
 746            where = t.cast(exp.Condition, partition_exp).isin(unnest="_sqlmesh_target_partitions_")
 747
 748            self._insert_overwrite_by_condition(
 749                table_name,
 750                [SourceQuery(query_factory=lambda: exp.select("*").from_(temp_table_name))],
 751                target_columns_to_types,
 752                where=where,
 753            )
 754
 755    def table_exists(self, table_name: TableName) -> bool:
 756        table = exp.to_table(table_name)
 757        data_object_cache_key = _get_data_object_cache_key(table.catalog, table.db, table.name)
 758        if data_object_cache_key in self._data_object_cache:
 759            logger.debug("Table existence cache hit: %s", data_object_cache_key)
 760            return self._data_object_cache[data_object_cache_key] is not None
 761
 762        try:
 763            from google.cloud.exceptions import NotFound
 764        except ModuleNotFoundError:
 765            from google.api_core.exceptions import NotFound
 766
 767        try:
 768            self._get_table(table_name)
 769            return True
 770        except NotFound:
 771            return False
 772
 773    def get_table_last_modified_ts(self, table_names: t.List[TableName]) -> t.List[int]:
 774        from sqlmesh.utils.date import to_timestamp
 775
 776        datasets_to_tables: t.DefaultDict[str, t.List[str]] = defaultdict(list)
 777        for table_name in table_names:
 778            table = exp.to_table(table_name)
 779            datasets_to_tables[table.db].append(table.name)
 780
 781        results = []
 782
 783        for dataset, tables in datasets_to_tables.items():
 784            query = (
 785                f"SELECT TIMESTAMP_MILLIS(last_modified_time) FROM `{dataset}.__TABLES__` WHERE "
 786            )
 787            for i, table_name in enumerate(tables):
 788                query += f"TABLE_ID = '{table_name}'"
 789                if i < len(tables) - 1:
 790                    query += " OR "
 791            results.extend(self.fetchall(query))
 792
 793        return [to_timestamp(row[0]) for row in results]
 794
 795    def _get_table(self, table_name: TableName) -> BigQueryTable:
 796        """
 797        Returns a BigQueryTable object for the given table name.
 798
 799        Raises: `google.cloud.exceptions.NotFound` if the table does not exist.
 800        """
 801        return self._db_call(self.client.get_table, table=self._table_name(table_name))
 802
 803    def _table_name(self, table_name: TableName) -> str:
 804        # the api doesn't support backticks, so we can't call exp.table_name or sql
 805        return ".".join(part.name for part in exp.to_table(table_name).parts)
 806
 807    def _fetch_native_df(
 808        self, query: t.Union[exp.Expr, str], quote_identifiers: bool = False
 809    ) -> DF:
 810        self.execute(query, quote_identifiers=quote_identifiers)
 811        query_job = self._query_job
 812        assert query_job is not None
 813        return query_job.to_dataframe()
 814
 815    def _create_column_comments(
 816        self,
 817        table_name: TableName,
 818        column_comments: t.Dict[str, str],
 819        table_kind: str = "TABLE",
 820        materialized_view: bool = False,
 821    ) -> None:
 822        if not (table_kind == "VIEW" and materialized_view):
 823            table = self._get_table(table_name)
 824
 825            # convert Table object to dict
 826            table_def = table.to_api_repr()
 827
 828            # Set column descriptions, supporting nested fields (e.g. record.field.nested_field)
 829            for column, comment in column_comments.items():
 830                fields = table_def["schema"]["fields"]
 831                field_names = column.split(".")
 832                last_index = len(field_names) - 1
 833
 834                # Traverse the fields with nested fields down to leaf level
 835                for idx, name in enumerate(field_names):
 836                    if field := next((field for field in fields if field["name"] == name), None):
 837                        if idx == last_index:
 838                            field["description"] = self._truncate_comment(
 839                                comment, self.MAX_COLUMN_COMMENT_LENGTH
 840                            )
 841                        else:
 842                            fields = field.get("fields") or []
 843
 844            # An "etag" is BQ versioning metadata that changes when an object is updated/modified. `update_table`
 845            # compares the etags of the table object passed to it and the remote table, erroring if the etags
 846            # don't match. We set the local etag to None to avoid this check.
 847            table_def["etag"] = None
 848
 849            # convert dict back to a Table object
 850            table = table.from_api_repr(table_def)
 851
 852            # update table schema
 853            logger.info(f"Registering column comments for table {table_name}")
 854            self._db_call(self.client.update_table, table=table, fields=["schema"])
 855
 856    def _build_description_property_exp(
 857        self,
 858        description: str,
 859        trunc_method: t.Callable,
 860    ) -> exp.Property:
 861        return exp.Property(
 862            this=exp.to_identifier("description", quoted=True),
 863            value=exp.Literal.string(trunc_method(description)),
 864        )
 865
 866    def _build_partitioned_by_exp(
 867        self,
 868        partitioned_by: t.List[exp.Expr],
 869        *,
 870        partition_interval_unit: t.Optional[IntervalUnit] = None,
 871        target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
 872        **kwargs: t.Any,
 873    ) -> t.Optional[exp.PartitionedByProperty]:
 874        if len(partitioned_by) > 1:
 875            raise SQLMeshError("BigQuery only supports partitioning by a single column")
 876
 877        this = partitioned_by[0]
 878        if (
 879            isinstance(this, exp.Column)
 880            and partition_interval_unit is not None
 881            and not partition_interval_unit.is_minute
 882        ):
 883            column_type: t.Optional[exp.DataType] = (target_columns_to_types or {}).get(this.name)
 884
 885            if column_type == exp.DataType.build(
 886                "date", dialect=self.dialect
 887            ) and partition_interval_unit in (
 888                IntervalUnit.MONTH,
 889                IntervalUnit.YEAR,
 890            ):
 891                trunc_func = "DATE_TRUNC"
 892            elif column_type == exp.DataType.build("timestamp", dialect=self.dialect):
 893                trunc_func = "TIMESTAMP_TRUNC"
 894            elif column_type == exp.DataType.build("datetime", dialect=self.dialect):
 895                trunc_func = "DATETIME_TRUNC"
 896            else:
 897                trunc_func = ""
 898
 899            if trunc_func:
 900                this = exp.func(
 901                    trunc_func,
 902                    this,
 903                    exp.var(partition_interval_unit.value.upper()),
 904                    dialect=self.dialect,
 905                )
 906
 907        return exp.PartitionedByProperty(this=this)
 908
 909    def _build_table_properties_exp(
 910        self,
 911        catalog_name: t.Optional[str] = None,
 912        table_format: t.Optional[str] = None,
 913        storage_format: t.Optional[str] = None,
 914        partitioned_by: t.Optional[t.List[exp.Expr]] = None,
 915        partition_interval_unit: t.Optional[IntervalUnit] = None,
 916        clustered_by: t.Optional[t.List[exp.Expr]] = None,
 917        table_properties: t.Optional[t.Dict[str, exp.Expr]] = None,
 918        target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
 919        table_description: t.Optional[str] = None,
 920        table_kind: t.Optional[str] = None,
 921        **kwargs: t.Any,
 922    ) -> t.Optional[exp.Properties]:
 923        properties: t.List[exp.Expr] = []
 924
 925        if partitioned_by and (
 926            partitioned_by_prop := self._build_partitioned_by_exp(
 927                partitioned_by,
 928                partition_interval_unit=partition_interval_unit,
 929                target_columns_to_types=target_columns_to_types,
 930            )
 931        ):
 932            properties.append(partitioned_by_prop)
 933
 934        if clustered_by and (clustered_by_exp := self._build_clustered_by_exp(clustered_by)):
 935            properties.append(clustered_by_exp)
 936
 937        if table_description:
 938            properties.append(
 939                self._build_description_property_exp(
 940                    table_description, self._truncate_table_comment
 941                ),
 942            )
 943
 944        properties.extend(self._table_or_view_properties_to_expressions(table_properties))
 945
 946        if properties:
 947            return exp.Properties(expressions=properties)
 948        return None
 949
 950    def _build_column_def(
 951        self,
 952        col_name: str,
 953        column_descriptions: t.Optional[t.Dict[str, str]] = None,
 954        engine_supports_schema_comments: bool = False,
 955        col_type: t.Optional[exp.DATA_TYPE] = None,
 956        nested_names: t.List[str] = [],
 957    ) -> exp.ColumnDef:
 958        # Helper function to build column definitions with column descriptions
 959        def _build_struct_with_descriptions(
 960            col_type: exp.DataType,
 961            nested_names: t.List[str],
 962        ) -> exp.DataType:
 963            column_expressions = []
 964            for column_def in col_type.expressions:
 965                # This is expected to  be true, but this check is included as a
 966                # precautionary measure in case of an unexpected edge case
 967                if isinstance(column_def, exp.ColumnDef):
 968                    column = self._build_column_def(
 969                        col_name=column_def.name,
 970                        column_descriptions=column_descriptions,
 971                        engine_supports_schema_comments=engine_supports_schema_comments,
 972                        col_type=column_def.kind,
 973                        nested_names=nested_names,
 974                    )
 975                else:
 976                    column = column_def
 977                column_expressions.append(column)
 978            return exp.DataType(this=col_type.this, expressions=column_expressions, nested=True)
 979
 980        # Recursively build column definitions for BigQuery's RECORDs (struct) and REPEATED RECORDs (array of struct)
 981        if isinstance(col_type, exp.DataType) and col_type.expressions:
 982            expressions = col_type.expressions
 983            if col_type.is_type(exp.DataType.Type.STRUCT):
 984                col_type = _build_struct_with_descriptions(col_type, nested_names + [col_name])
 985            elif col_type.is_type(exp.DataType.Type.ARRAY) and expressions[0].is_type(
 986                exp.DataType.Type.STRUCT
 987            ):
 988                col_type = exp.DataType(
 989                    this=exp.DataType.Type.ARRAY,
 990                    expressions=[
 991                        _build_struct_with_descriptions(
 992                            col_type.expressions[0], nested_names + [col_name]
 993                        )
 994                    ],
 995                    nested=True,
 996                )
 997
 998        return exp.ColumnDef(
 999            this=exp.to_identifier(col_name),
1000            kind=col_type,
1001            constraints=(
1002                self._build_col_comment_exp(
1003                    ".".join(nested_names + [col_name]), column_descriptions
1004                )
1005                if engine_supports_schema_comments and self.comments_enabled and column_descriptions
1006                else None
1007            ),
1008        )
1009
1010    def _build_col_comment_exp(
1011        self, col_name: str, column_descriptions: t.Dict[str, str]
1012    ) -> t.List[exp.ColumnConstraint]:
1013        comment = column_descriptions.get(col_name, None)
1014        if comment:
1015            return [
1016                exp.ColumnConstraint(
1017                    kind=exp.Properties(
1018                        expressions=[
1019                            self._build_description_property_exp(
1020                                comment, self._truncate_column_comment
1021                            ),
1022                        ]
1023                    )
1024                )
1025            ]
1026        return []
1027
1028    def _build_view_properties_exp(
1029        self,
1030        view_properties: t.Optional[t.Dict[str, exp.Expr]] = None,
1031        table_description: t.Optional[str] = None,
1032        **kwargs: t.Any,
1033    ) -> t.Optional[exp.Properties]:
1034        """Creates a SQLGlot table properties expression for view"""
1035        properties: t.List[exp.Expr] = []
1036
1037        if table_description:
1038            properties.append(
1039                self._build_description_property_exp(
1040                    table_description, self._truncate_table_comment
1041                ),
1042            )
1043
1044        properties.extend(self._table_or_view_properties_to_expressions(view_properties))
1045
1046        if properties:
1047            return exp.Properties(expressions=properties)
1048        return None
1049
1050    def _build_create_comment_table_exp(
1051        self, table: exp.Table, table_comment: str, table_kind: str
1052    ) -> exp.Comment | str:
1053        table_sql = table.sql(dialect=self.dialect, identify=True)
1054
1055        truncated_comment = self._truncate_table_comment(table_comment)
1056        comment_sql = exp.Literal.string(truncated_comment).sql(dialect=self.dialect)
1057
1058        return f"ALTER {table_kind} {table_sql} SET OPTIONS(description = {comment_sql})"
1059
1060    def _build_create_comment_column_exp(
1061        self, table: exp.Table, column_name: str, column_comment: str, table_kind: str = "TABLE"
1062    ) -> exp.Comment | str:
1063        table_sql = table.sql(dialect=self.dialect, identify=True)
1064        column_sql = exp.column(column_name).sql(dialect=self.dialect, identify=True)
1065
1066        truncated_comment = self._truncate_column_comment(column_comment)
1067        comment_sql = exp.Literal.string(truncated_comment).sql(dialect=self.dialect)
1068
1069        return f"ALTER {table_kind} {table_sql} ALTER COLUMN {column_sql} SET OPTIONS(description = {comment_sql})"
1070
1071    def create_state_table(
1072        self,
1073        table_name: str,
1074        target_columns_to_types: t.Dict[str, exp.DataType],
1075        primary_key: t.Optional[t.Tuple[str, ...]] = None,
1076    ) -> None:
1077        self.create_table(
1078            table_name,
1079            target_columns_to_types,
1080        )
1081
1082    def _db_call(self, func: t.Callable[..., t.Any], *args: t.Any, **kwargs: t.Any) -> t.Any:
1083        return func(
1084            retry=self.__retry,
1085            *args,
1086            **kwargs,
1087        )
1088
    def _execute(
        self,
        sql: str,
        track_rows_processed: bool = False,
        **kwargs: t.Any,
    ) -> None:
        """Execute a sql query as a BigQuery query job and expose its results through the cursor.

        The job is created and awaited through `_db_call`, stored in `_query_job`, and its rows
        are stored in `_query_data`. The cursor's row count and description are then updated
        from the job's results.

        Args:
            sql: The SQL text to run.
            track_rows_processed: Whether to report the execution to the query execution tracker
                (only done when a tracker exists and is currently tracking).
            **kwargs: Accepted but not used by this implementation.
        """
        from google.cloud.bigquery import QueryJobConfig
        from google.cloud.bigquery.query import ConnectionProperty

        # BigQuery's Python DB API implementation does not support retries, so we have to implement them ourselves.
        # So we update the cursor's query job and query data with the results of the new query job. This makes sure
        # that other cursor based operations execute correctly.
        session_id = self._session_id
        # Pass the stored session id (if any) along with the job as a connection property.
        connection_properties = (
            [
                ConnectionProperty(key="session_id", value=session_id),
            ]
            if session_id
            else []
        )

        # Create job config
        job_config = QueryJobConfig(**self._job_params, connection_properties=connection_properties)

        # Submit the job; the creation timeout comes from the adapter's extra config (None when unset).
        self._query_job = self._db_call(
            self.client.query,
            query=sql,
            job_config=job_config,
            timeout=self._extra_config.get("job_creation_timeout_seconds"),
        )
        query_job = self._query_job
        assert query_job is not None

        logger.debug(
            "BigQuery job created: https://console.cloud.google.com/bigquery?project=%s&j=bq:%s:%s",
            query_job.project,
            query_job.location,
            query_job.job_id,
        )

        # Wait for the job's result, bounded by the execution timeout from the extra config (None when unset).
        results = self._db_call(
            query_job.result,
            timeout=self._extra_config.get("job_execution_timeout_seconds"),  # type: ignore
        )

        # Use an empty iterator when the result reports no rows.
        self._query_data = iter(results) if results.total_rows else iter([])
        # NOTE(review): relies on private attributes/methods of the job and cursor objects.
        query_results = query_job._query_results
        self.cursor._set_rowcount(query_results)
        self.cursor._set_description(query_results.schema)

        if (
            track_rows_processed
            and self._query_execution_tracker
            and self._query_execution_tracker.is_tracking()
        ):
            # Stays None for statement types other than the ones handled below.
            num_rows = None
            if query_job.statement_type == "CREATE_TABLE_AS_SELECT":
                # since table was just created, number rows in table == number rows processed
                query_table = self.client.get_table(query_job.destination)
                num_rows = query_table.num_rows
            elif query_job.statement_type in ["INSERT", "DELETE", "MERGE", "UPDATE"]:
                num_rows = query_job.num_dml_affected_rows

            self._query_execution_tracker.record_execution(
                sql, num_rows, query_job.total_bytes_processed
            )
1156
    def _get_data_objects(
        self, schema_name: SchemaName, object_names: t.Optional[t.Set[str]] = None
    ) -> t.List[DataObject]:
        """
        Returns all the data objects that exist in the given schema and optionally catalog.

        Args:
            schema_name: The schema (dataset) to inspect. When it carries no catalog, the
                adapter's default catalog is used.
            object_names: When provided and non-empty, only objects with these names are returned.

        Returns:
            One DataObject per row of the dataset's INFORMATION_SCHEMA.TABLES view, including the
            clustering key (if any) gathered from INFORMATION_SCHEMA.COLUMNS. An empty list is
            returned when the query fails with an error whose message contains "Not found".
        """

        # The BigQuery Client's list_tables method does not support filtering by table name, so we have to
        # resort to using SQL instead.
        schema = to_schema(schema_name)
        catalog = schema.catalog or self.default_catalog
        query = (
            exp.select(
                exp.column("table_catalog").as_("catalog"),
                exp.column("table_name").as_("name"),
                exp.column("table_schema").as_("schema_name"),
                # Map the reported table types onto TABLE / VIEW / MATERIALIZED_VIEW; any other
                # table type is passed through unchanged.
                exp.case()
                .when(exp.column("table_type").eq("BASE TABLE"), exp.Literal.string("TABLE"))
                .when(exp.column("table_type").eq("CLONE"), exp.Literal.string("TABLE"))
                .when(exp.column("table_type").eq("EXTERNAL"), exp.Literal.string("TABLE"))
                .when(exp.column("table_type").eq("SNAPSHOT"), exp.Literal.string("TABLE"))
                .when(exp.column("table_type").eq("VIEW"), exp.Literal.string("VIEW"))
                .when(
                    exp.column("table_type").eq("MATERIALIZED VIEW"),
                    exp.Literal.string("MATERIALIZED_VIEW"),
                )
                .else_(exp.column("table_type"))
                .as_("type"),
                exp.column("clustering_key", "ci").as_("clustering_key"),
            )
            # CTE that aggregates each table's clustering columns, ordered by their clustering
            # position, into a single string.
            .with_(
                "clustering_info",
                as_=exp.select(
                    exp.column("table_catalog"),
                    exp.column("table_schema"),
                    exp.column("table_name"),
                    parse_one(
                        "string_agg(column_name order by clustering_ordinal_position)",
                        dialect=self.dialect,
                    ).as_("clustering_key"),
                )
                .from_(
                    exp.to_table(
                        f"`{catalog}`.`{schema.db}`.INFORMATION_SCHEMA.COLUMNS",
                        dialect=self.dialect,
                    )
                )
                .where(exp.column("clustering_ordinal_position").is_(exp.not_(exp.null())))
                .group_by("1", "2", "3"),
            )
            .from_(
                exp.to_table(
                    f"`{catalog}`.`{schema.db}`.INFORMATION_SCHEMA.TABLES", dialect=self.dialect
                )
            )
            # Left join so that tables without clustering columns are still returned.
            .join(
                "clustering_info",
                using=["table_catalog", "table_schema", "table_name"],
                join_type="left",
                join_alias="ci",
            )
        )
        if object_names:
            query = query.where(exp.column("table_name").isin(*object_names))

        try:
            df = self.fetchdf(query, quote_identifiers=True)
        except Exception as e:
            # NOTE(review): presumably raised when the dataset does not exist; treated as
            # "no objects". Any other error is re-raised.
            if "Not found" in str(e):
                return []
            raise

        if df.empty:
            return []
        return [
            DataObject(
                catalog=row.catalog,  # type: ignore
                schema=row.schema_name,  # type: ignore
                name=row.name,  # type: ignore
                type=DataObjectType.from_str(row.type),  # type: ignore
                # The clustering key is wrapped in parentheses when present.
                clustering_key=f"({row.clustering_key})" if row.clustering_key else None,  # type: ignore
            )
            for row in df.itertuples()
        ]
1241
1242    def _update_clustering_key(self, operation: TableAlterClusterByOperation) -> None:
1243        cluster_key_expressions = getattr(operation, "cluster_key_expressions", [])
1244        bq_table = self._get_table(operation.target_table)
1245
1246        rendered_columns = [c.sql(dialect=self.dialect) for c in cluster_key_expressions]
1247        bq_table.clustering_fields = (
1248            rendered_columns or None
1249        )  # causes a drop of the key if cluster_by is empty or None
1250
1251        self._db_call(self.client.update_table, table=bq_table, fields=["clustering_fields"])
1252
1253        if cluster_key_expressions:
1254            # BigQuery only applies new clustering going forward, so this rewrites the columns to apply the new clustering to historical data
1255            # ref: https://cloud.google.com/bigquery/docs/creating-clustered-tables#modifying-cluster-spec
1256            self.execute(
1257                exp.update(
1258                    operation.target_table,
1259                    {c: c for c in cluster_key_expressions},
1260                    where=exp.true(),
1261                )
1262            )
1263
1264    def _normalize_decimal_value(self, col: exp.Expr, precision: int) -> exp.Expr:
1265        return exp.func("FORMAT", exp.Literal.string(f"%.{precision}f"), col)
1266
1267    def _normalize_nested_value(self, col: exp.Expr) -> exp.Expr:
1268        return exp.func("TO_JSON_STRING", col, dialect=self.dialect)
1269
1270    @t.overload
1271    def _columns_to_types(
1272        self,
1273        query_or_df: DF,
1274        target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
1275        source_columns: t.Optional[t.List[str]] = None,
1276    ) -> t.Tuple[t.Dict[str, exp.DataType], t.List[str]]: ...
1277
1278    @t.overload
1279    def _columns_to_types(
1280        self,
1281        query_or_df: Query,
1282        target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
1283        source_columns: t.Optional[t.List[str]] = None,
1284    ) -> t.Tuple[t.Optional[t.Dict[str, exp.DataType]], t.Optional[t.List[str]]]: ...
1285
1286    def _columns_to_types(
1287        self,
1288        query_or_df: QueryOrDF,
1289        target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
1290        source_columns: t.Optional[t.List[str]] = None,
1291    ) -> t.Tuple[t.Optional[t.Dict[str, exp.DataType]], t.Optional[t.List[str]]]:
1292        if (
1293            not target_columns_to_types
1294            and bigframes
1295            and isinstance(query_or_df, bigframes.dataframe.DataFrame)
1296        ):
1297            # using dry_run=True attempts to prevent the DataFrame from being materialized just to read the column types from it
1298            dtypes = query_or_df.to_pandas(dry_run=True).columnDtypes
1299            target_columns_to_types = columns_to_types_from_dtypes(dtypes.items())
1300            return target_columns_to_types, list(source_columns or target_columns_to_types)
1301
1302        return super()._columns_to_types(
1303            query_or_df, target_columns_to_types, source_columns=source_columns
1304        )
1305
1306    def _native_df_to_pandas_df(
1307        self,
1308        query_or_df: QueryOrDF,
1309    ) -> t.Union[Query, pd.DataFrame]:
1310        if bigframes and isinstance(query_or_df, bigframes.dataframe.DataFrame):
1311            return query_or_df.to_pandas()
1312
1313        return super()._native_df_to_pandas_df(query_or_df)
1314
1315    @property
1316    def _query_data(self) -> t.Any:
1317        return self._connection_pool.get_attribute("query_data")
1318
1319    @_query_data.setter
1320    def _query_data(self, value: t.Any) -> None:
1321        self._connection_pool.set_attribute("query_data", value)
1322
1323    @property
1324    def _query_job(self) -> t.Optional[QueryJob]:
1325        return self._connection_pool.get_attribute("query_job")
1326
1327    @_query_job.setter
1328    def _query_job(self, value: t.Any) -> None:
1329        self._connection_pool.set_attribute("query_job", value)
1330
1331    @property
1332    def _session_id(self) -> t.Any:
1333        return self._connection_pool.get_attribute("session_id")
1334
1335    @_session_id.setter
1336    def _session_id(self, value: t.Any) -> None:
1337        self._connection_pool.set_attribute("session_id", value)
1338
1339    def _get_current_schema(self) -> str:
1340        raise NotImplementedError("BigQuery does not support current schema")
1341
1342    def _get_bq_dataset_location(self, project: str, dataset: str) -> str:
1343        return self._db_call(self.client.get_dataset, dataset_ref=f"{project}.{dataset}").location
1344
1345    def _get_grant_expression(self, table: exp.Table) -> exp.Expr:
1346        if not table.db:
1347            raise ValueError(
1348                f"Table {table.sql(dialect=self.dialect)} does not have a schema (dataset)"
1349            )
1350        project = table.catalog or self.get_current_catalog()
1351        if not project:
1352            raise ValueError(
1353                f"Table {table.sql(dialect=self.dialect)} does not have a catalog (project)"
1354            )
1355
1356        dataset = table.db
1357        table_name = table.name
1358        location = self._get_bq_dataset_location(project, dataset)
1359
1360        # https://cloud.google.com/bigquery/docs/information-schema-object-privileges
1361        # OBJECT_PRIVILEGES is a project-level INFORMATION_SCHEMA view with regional qualifier
1362        object_privileges_table = exp.to_table(
1363            f"`{project}`.`region-{location}`.INFORMATION_SCHEMA.{self.GRANT_INFORMATION_SCHEMA_TABLE_NAME}",
1364            dialect=self.dialect,
1365        )
1366        return (
1367            exp.select("privilege_type", "grantee")
1368            .from_(object_privileges_table)
1369            .where(
1370                exp.and_(
1371                    exp.column("object_schema").eq(exp.Literal.string(dataset)),
1372                    exp.column("object_name").eq(exp.Literal.string(table_name)),
1373                    # Filter out current_user
1374                    # BigQuery grantees format: "user:email" or "group:name"
1375                    exp.func("split", exp.column("grantee"), exp.Literal.string(":"))[
1376                        exp.func("OFFSET", exp.Literal.number("1"))
1377                    ].neq(self.CURRENT_USER_OR_ROLE_EXPRESSION),
1378                )
1379            )
1380        )
1381
1382    @staticmethod
1383    def _grant_object_kind(table_type: DataObjectType) -> str:
1384        if table_type == DataObjectType.VIEW:
1385            return "VIEW"
1386        if table_type == DataObjectType.MATERIALIZED_VIEW:
1387            # We actually need to use "MATERIALIZED VIEW" here even though it's not listed
1388            # as a supported resource_type in the BigQuery DCL doc:
1389            # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-control-language
1390            return "MATERIALIZED VIEW"
1391        return "TABLE"
1392
1393    def _dcl_grants_config_expr(
1394        self,
1395        dcl_cmd: t.Type[DCL],
1396        table: exp.Table,
1397        grants_config: GrantsConfig,
1398        table_type: DataObjectType = DataObjectType.TABLE,
1399    ) -> t.List[exp.Expr]:
1400        expressions: t.List[exp.Expr] = []
1401        if not grants_config:
1402            return expressions
1403
1404        # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-control-language
1405
1406        def normalize_principal(p: str) -> str:
1407            if ":" not in p:
1408                raise ValueError(f"Principal '{p}' missing a prefix label")
1409
1410            # allUsers and allAuthenticatedUsers special groups that are cas-sensitive and must start with "specialGroup:"
1411            if p.endswith("allUsers") or p.endswith("allAuthenticatedUsers"):
1412                if not p.startswith("specialGroup:"):
1413                    raise ValueError(
1414                        f"Special group principal '{p}' must start with 'specialGroup:' prefix label"
1415                    )
1416                return p
1417
1418            label, principal = p.split(":", 1)
1419            # always lowercase principals
1420            return f"{label}:{principal.lower()}"
1421
1422        object_kind = self._grant_object_kind(table_type)
1423        for privilege, principals in grants_config.items():
1424            if not principals:
1425                continue
1426
1427            noramlized_principals = [exp.Literal.string(normalize_principal(p)) for p in principals]
1428            args: t.Dict[str, t.Any] = {
1429                "privileges": [exp.GrantPrivilege(this=exp.to_identifier(privilege, quoted=True))],
1430                "securable": table.copy(),
1431                "principals": noramlized_principals,
1432            }
1433
1434            if object_kind:
1435                args["kind"] = exp.Var(this=object_kind)
1436
1437            expressions.append(dcl_cmd(**args))  # type: ignore[arg-type]
1438
1439        return expressions
1440
1441
1442class _ErrorCounter:
1443    """
1444    A class that counts errors and determines whether or not to retry based on the number of errors and the error
1445    type.
1446
1447    Reference implementation: https://github.com/dbt-labs/dbt-bigquery/blob/8339a034929b12e027f0a143abf46582f3f6ffbc/dbt/adapters/bigquery/connections.py#L672
1448
1449    TODO: Implement a retry configuration that works across all engines
1450    """
1451
1452    def __init__(self, num_retries: int) -> None:
1453        self.num_retries = num_retries
1454        self.error_count = 0
1455
1456    @property
1457    def retryable_errors(self) -> t.Tuple[t.Type[Exception], ...]:
1458        try:
1459            from google.cloud.exceptions import ServerError
1460        except ModuleNotFoundError:
1461            from google.api_core.exceptions import ServerError
1462        from requests.exceptions import ConnectionError
1463
1464        return (ServerError, ConnectionError)
1465
1466    def _is_retryable(self, error: BaseException) -> bool:
1467        from google.api_core.exceptions import Forbidden
1468
1469        if isinstance(error, self.retryable_errors):
1470            return True
1471        if isinstance(error, Forbidden) and any(
1472            e["reason"] == "rateLimitExceeded" for e in error.errors
1473        ):
1474            return True
1475        return False
1476
1477    def should_retry(self, error: BaseException) -> bool:
1478        if self.num_retries == 0:
1479            return False
1480        self.error_count += 1
1481        if self._is_retryable(error) and self.error_count <= self.num_retries:
1482            logger.info(f"Retry Num {self.error_count} of {self.num_retries}. Error: {repr(error)}")
1483            return True
1484        return False
1485
1486
def select_partitions_expr(
    schema: str,
    table_name: str,
    data_type: t.Union[str, exp.DataType],
    granularity: t.Optional[str] = None,
    agg_func: str = "MAX",
    catalog: t.Optional[str] = None,
) -> str:
    """Builds a query that aggregates the partition values of a table.

    The partition ids are read from the dataset's INFORMATION_SCHEMA.PARTITIONS view. For DATE,
    DATETIME and TIMESTAMP columns they are parsed with the format matching the granularity;
    for any other type they are cast to INT64.

    Args:
        schema: The schema (BigQuery dataset) of the table.
        table_name: The name of the table.
        data_type: The data type of the partition column.
        granularity: The granularity of the partition. Supported values are: 'day', 'month', 'year' and 'hour'.
            Defaults to 'day' for DATE, DATETIME and TIMESTAMP columns.
        agg_func: The aggregation function to use.
        catalog: The catalog (BigQuery project ID) of the table.

    Returns:
        A SELECT statement that aggregates partition values for a table.
    """
    partitions_view = f"`{schema}`.INFORMATION_SCHEMA.PARTITIONS"
    if catalog:
        partitions_view = f"`{catalog}`.{partitions_view}"

    type_sql = data_type.sql(dialect="bigquery") if isinstance(data_type, exp.DataType) else data_type
    type_name = type_sql.upper()

    if type_name in ("DATE", "DATETIME", "TIMESTAMP"):
        effective_granularity = granularity or "day"
        parse_format = GRANULARITY_TO_PARTITION_FORMAT[effective_granularity.lower()]
        partition_expr = exp.func(
            f"PARSE_{type_name}",
            exp.Literal.string(parse_format),
            exp.column("partition_id"),
            dialect="bigquery",
        )
    else:
        partition_expr = exp.cast(exp.column("partition_id"), "INT64", dialect="bigquery")

    aggregated = exp.select(exp.func(agg_func, partition_expr))
    query = aggregated.from_(partitions_view, dialect="bigquery").where(
        f"table_name = '{table_name}' AND partition_id IS NOT NULL AND partition_id != '__NULL__'",
        copy=False,
    )
    return query.sql(dialect="bigquery")
1538
1539
# Maps a partition granularity to the format string used to parse the `partition_id` values of
# INFORMATION_SCHEMA.PARTITIONS for DATE, DATETIME and TIMESTAMP partition columns.
GRANULARITY_TO_PARTITION_FORMAT = {
    "day": "%Y%m%d",
    "month": "%Y%m",
    "year": "%Y",
    "hour": "%Y%m%d%H",
}

logger = <Logger sqlmesh.core.engine_adapter.bigquery (WARNING)>

NestedField = typing.Tuple[str, str, typing.List[str]]

NestedFieldsDict = typing.Dict[str, typing.List[typing.Tuple[str, str, typing.List[str]]]]

def select_partitions_expr( schema: str, table_name: str, data_type: Union[str, sqlglot.expressions.datatypes.DataType], granularity: Optional[str] = None, agg_func: str = 'MAX', catalog: Optional[str] = None) -> str: View Source

def select_partitions_expr(
    schema: str,
    table_name: str,
    data_type: t.Union[str, exp.DataType],
    granularity: t.Optional[str] = None,
    agg_func: str = "MAX",
    catalog: t.Optional[str] = None,
) -> str:
    """Build a SELECT statement that aggregates the partition values of a table.

    The query reads `partition_id` from the schema's INFORMATION_SCHEMA.PARTITIONS
    view. For DATE, DATETIME and TIMESTAMP partition columns the id is parsed with
    the matching PARSE_* function; for any other type it is cast to INT64. Rows
    whose `partition_id` is NULL or equal to '__NULL__' are filtered out.

    Args:
        schema: The schema (BigQuery dataset) of the table.
        table_name: The name of the table.
        data_type: The data type of the partition column, as a string or a
            sqlglot data type expression.
        granularity: The granularity of the partition. Supported values are:
            'day', 'month', 'year' and 'hour' (case-insensitive). Falls back to
            'day' when not provided. Only consulted for DATE, DATETIME and
            TIMESTAMP partition columns.
        agg_func: The aggregation function to apply to the partition values.
        catalog: The catalog (BigQuery project ID) of the table.

    Returns:
        The SELECT statement rendered as a BigQuery SQL string.
    """
    info_schema_view = f"`{schema}`.INFORMATION_SCHEMA.PARTITIONS"
    source = f"`{catalog}`.{info_schema_view}" if catalog else info_schema_view

    type_sql = data_type.sql(dialect="bigquery") if isinstance(data_type, exp.DataType) else data_type
    type_name = type_sql.upper()

    partition_id = exp.column("partition_id")
    if type_name in ("DATE", "DATETIME", "TIMESTAMP"):
        # Temporal partitions encode their value in partition_id using a
        # granularity-specific format, so it has to be parsed back.
        id_format = GRANULARITY_TO_PARTITION_FORMAT[(granularity or "day").lower()]
        partition_value = exp.func(
            f"PARSE_{type_name}",
            exp.Literal.string(id_format),
            partition_id,
            dialect="bigquery",
        )
    else:
        partition_value = exp.cast(partition_id, "INT64", dialect="bigquery")

    predicate = f"table_name = '{table_name}' AND partition_id IS NOT NULL AND partition_id != '__NULL__'"

    query = exp.select(exp.func(agg_func, partition_value))
    query = query.from_(source, dialect="bigquery")
    query = query.where(predicate, copy=False)
    return query.sql(dialect="bigquery")

Generates a SQL SELECT statement that aggregates partition values for a table.

Arguments:

schema: The schema (BigQuery dataset) of the table.
table_name: The name of the table.
data_type: The data type of the partition column.
granularity: The granularity of the partition. Supported values are: 'day', 'month', 'year' and 'hour'.
agg_func: The aggregation function to use.
catalog: The catalog (BigQuery project ID) of the table.

Returns:

A SELECT statement that aggregates partition values for a table.

GRANULARITY_TO_PARTITION_FORMAT = {'day': '%Y%m%d', 'month': '%Y%m', 'year': '%Y', 'hour': '%Y%m%d%H'}

sqlmesh.core.engine_adapter.bigquery

Arguments:

Inherited Members

Arguments:

Returns: