Edit on GitHub
sqlmesh.core.engine_adapter.redshift

View Source
  1from __future__ import annotations
  2
  3import logging
  4import typing as t
  5
  6from sqlglot import exp
  7from sqlglot.helper import ensure_list
  8
  9from sqlmesh.core.dialect import to_schema
 10from sqlmesh.core.engine_adapter.base import MERGE_SOURCE_ALIAS, MERGE_TARGET_ALIAS
 11from sqlmesh.core.engine_adapter.base_postgres import BasePostgresEngineAdapter
 12from sqlmesh.core.engine_adapter.mixins import (
 13    GetCurrentCatalogFromFunctionMixin,
 14    NonTransactionalTruncateMixin,
 15    VarcharSizeWorkaroundMixin,
 16    RowDiffMixin,
 17    logical_merge,
 18    GrantsFromInfoSchemaMixin,
 19)
 20from sqlmesh.core.engine_adapter.shared import (
 21    CommentCreationView,
 22    DataObject,
 23    DataObjectType,
 24    SourceQuery,
 25    set_catalog,
 26)
 27from sqlmesh.utils.errors import SQLMeshError
 28
 29if t.TYPE_CHECKING:
 30    import pandas as pd
 31
 32    from sqlmesh.core._typing import SchemaName, TableName
 33    from sqlmesh.core.engine_adapter.base import QueryOrDF, Query
 34    from sqlmesh.core.node import IntervalUnit
 35
 36logger = logging.getLogger(__name__)
 37
 38
@set_catalog()
class RedshiftEngineAdapter(
    BasePostgresEngineAdapter,
    GetCurrentCatalogFromFunctionMixin,
    NonTransactionalTruncateMixin,
    VarcharSizeWorkaroundMixin,
    RowDiffMixin,
    GrantsFromInfoSchemaMixin,
):
    """Engine adapter for the `redshift` dialect.

    Builds on the shared Postgres adapter and the mixins listed in the class bases, and
    overrides the operations that need Redshift-specific handling.
    """

    DIALECT = "redshift"
    # Function expression for `current_database`; presumably evaluated by
    # GetCurrentCatalogFromFunctionMixin to resolve the current catalog - confirm against the mixin.
    CURRENT_CATALOG_EXPRESSION = exp.func("current_database")
    # Redshift doesn't support comments for VIEWs WITH NO SCHEMA BINDING (which `create_view`
    # uses unless the view query contains a recursive CTE)
    COMMENT_CREATION_VIEW = CommentCreationView.UNSUPPORTED
    SUPPORTS_REPLACE_TABLE = False
    SUPPORTS_GRANTS = True
    SUPPORTS_MULTIPLE_GRANT_PRINCIPALS = True

    # Per-type parameter metadata keyed by the sqlglot type of each Redshift data type.
    # NOTE(review): presumably forwarded as keyword arguments to the schema differ - confirm
    # against the base adapter.
    SCHEMA_DIFFER_KWARGS = {
        "parameterized_type_defaults": {
            exp.DataType.build("VARBYTE", dialect=DIALECT).this: [(64000,)],
            exp.DataType.build("DECIMAL", dialect=DIALECT).this: [(18, 0), (0,)],
            exp.DataType.build("CHAR", dialect=DIALECT).this: [(1,)],
            exp.DataType.build("VARCHAR", dialect=DIALECT).this: [(256,)],
            exp.DataType.build("NCHAR", dialect=DIALECT).this: [(1,)],
            exp.DataType.build("NVARCHAR", dialect=DIALECT).this: [(256,)],
        },
        "max_parameter_length": {
            exp.DataType.build("CHAR", dialect=DIALECT).this: 4096,
            exp.DataType.build("VARCHAR", dialect=DIALECT).this: 65535,
        },
        "precision_increase_allowed_types": {exp.DataType.build("VARCHAR", dialect=DIALECT).this},
        "drop_cascade": True,
    }
    # Lower-cased type names for which `columns()` appends the reported maximum length
    # to the type, e.g. `varchar(256)`.
    VARIABLE_LENGTH_DATA_TYPES = {
        "char",
        "character",
        "nchar",
        "varchar",
        "character varying",
        "nvarchar",
        "varbyte",
        "varbinary",
        "binary varying",
    }
 83
 84    def columns(
 85        self,
 86        table_name: TableName,
 87        include_pseudo_columns: bool = True,
 88    ) -> t.Dict[str, exp.DataType]:
 89        table = exp.to_table(table_name)
 90
 91        sql = (
 92            exp.select(
 93                "column_name",
 94                "data_type",
 95                "character_maximum_length",
 96                "numeric_precision",
 97                "numeric_scale",
 98            )
 99            .from_("svv_columns")  # Includes late-binding views
100            .where(exp.column("table_name").eq(table.alias_or_name))
101        )
102        if table.args.get("db"):
103            sql = sql.where(exp.column("table_schema").eq(table.args["db"].name))
104
105        columns_raw = self.fetchall(sql, quote_identifiers=True)
106
107        def build_var_length_col(
108            column_name: str,
109            data_type: str,
110            character_maximum_length: t.Optional[int] = None,
111            numeric_precision: t.Optional[int] = None,
112            numeric_scale: t.Optional[int] = None,
113        ) -> tuple:
114            data_type = data_type.lower()
115            if (
116                data_type in self.VARIABLE_LENGTH_DATA_TYPES
117                and character_maximum_length is not None
118            ):
119                return (column_name, f"{data_type}({character_maximum_length})")
120            if data_type in ("decimal", "numeric"):
121                return (column_name, f"{data_type}({numeric_precision}, {numeric_scale})")
122
123            return (column_name, data_type)
124
125        columns = [build_var_length_col(*row) for row in columns_raw]
126
127        return {
128            column_name: exp.DataType.build(data_type, dialect=self.dialect)
129            for column_name, data_type in columns
130        }
131
132    @property
133    def enable_merge(self) -> bool:
134        # Redshift supports the MERGE operation but we use the logical merge
135        # unless the user has opted in by setting enable_merge in the connection.
136        return bool(self._extra_config.get("enable_merge"))
137
138    @property
139    def cursor(self) -> t.Any:
140        # Redshift by default uses a `format` paramstyle that has issues when we try to write our snapshot
141        # data to snapshot table. There doesn't seem to be a way to disable parameter overriding so we just
142        # set it to `qmark` since that doesn't cause issues.
143        cursor = self._connection_pool.get_cursor()
144        cursor.paramstyle = "qmark"
145        return cursor
146
147    def _fetch_native_df(
148        self, query: t.Union[exp.Expr, str], quote_identifiers: bool = False
149    ) -> pd.DataFrame:
150        """Fetches a Pandas DataFrame from the cursor"""
151        import pandas as pd
152
153        self.execute(query, quote_identifiers=quote_identifiers)
154
155        # We manually build the `DataFrame` here because the driver's `fetch_dataframe`
156        # method does not respect the active case-sensitivity configuration.
157        #
158        # Context: https://github.com/aws/amazon-redshift-python-driver/issues/238
159        fetcheddata = self.cursor.fetchall()
160
161        try:
162            columns = [column[0] for column in self.cursor.description]
163        except Exception:
164            columns = None
165            logging.warning(
166                "No row description was found, pandas dataframe will be missing column labels."
167            )
168
169        result = [tuple(row) for row in fetcheddata]
170        return pd.DataFrame(result, columns=columns)
171
172    def _create_table_from_source_queries(
173        self,
174        table_name: TableName,
175        source_queries: t.List[SourceQuery],
176        target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
177        exists: bool = True,
178        replace: bool = False,
179        table_description: t.Optional[str] = None,
180        column_descriptions: t.Optional[t.Dict[str, str]] = None,
181        table_kind: t.Optional[str] = None,
182        track_rows_processed: bool = True,
183        **kwargs: t.Any,
184    ) -> None:
185        """
186        Redshift doesn't support `CREATE TABLE IF NOT EXISTS AS...` but does support `CREATE TABLE AS...` so
187        we check if the exists check exists and if not then we can use the base implementation. Otherwise we
188        manually check if it exists and if it does then this is a no-op anyways so we return and if it doesn't
189        then we run the query with exists set to False since we just confirmed it doesn't exist.
190        """
191        if not exists:
192            return super()._create_table_from_source_queries(
193                table_name,
194                source_queries,
195                target_columns_to_types,
196                exists,
197                table_description=table_description,
198                column_descriptions=column_descriptions,
199                **kwargs,
200            )
201        if self.table_exists(table_name):
202            return
203        super()._create_table_from_source_queries(
204            table_name,
205            source_queries,
206            exists=False,
207            table_description=table_description,
208            column_descriptions=column_descriptions,
209            **kwargs,
210        )
211
212    def create_view(
213        self,
214        view_name: TableName,
215        query_or_df: QueryOrDF,
216        target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
217        replace: bool = True,
218        materialized: bool = False,
219        materialized_properties: t.Optional[t.Dict[str, t.Any]] = None,
220        table_description: t.Optional[str] = None,
221        column_descriptions: t.Optional[t.Dict[str, str]] = None,
222        view_properties: t.Optional[t.Dict[str, exp.Expr]] = None,
223        source_columns: t.Optional[t.List[str]] = None,
224        **create_kwargs: t.Any,
225    ) -> None:
226        """
227        Redshift views are "binding" by default to their underlying table which means you can't drop that
228        underlying table without dropping the view first. This is a problem for us since we want to be able to
229        swap tables out from under views. Therefore, we create the view as non-binding.
230        """
231        no_schema_binding = True
232        if isinstance(query_or_df, exp.Expr):
233            # We can't include NO SCHEMA BINDING if the query has a recursive CTE
234            has_recursive_cte = any(
235                w.args.get("recursive", False) for w in query_or_df.find_all(exp.With)
236            )
237            no_schema_binding = not has_recursive_cte
238
239        return super().create_view(
240            view_name,
241            query_or_df,
242            target_columns_to_types,
243            replace,
244            materialized,
245            materialized_properties,
246            table_description=table_description,
247            column_descriptions=column_descriptions,
248            no_schema_binding=no_schema_binding,
249            view_properties=view_properties,
250            source_columns=source_columns,
251            **create_kwargs,
252        )
253
254    def _build_table_properties_exp(
255        self,
256        catalog_name: t.Optional[str] = None,
257        table_format: t.Optional[str] = None,
258        storage_format: t.Optional[str] = None,
259        partitioned_by: t.Optional[t.List[exp.Expr]] = None,
260        partition_interval_unit: t.Optional[IntervalUnit] = None,
261        clustered_by: t.Optional[t.List[exp.Expr]] = None,
262        table_properties: t.Optional[t.Dict[str, exp.Expr]] = None,
263        target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
264        table_description: t.Optional[str] = None,
265        table_kind: t.Optional[str] = None,
266        **kwargs: t.Any,
267    ) -> t.Optional[exp.Properties]:
268        properties: t.List[exp.Expr] = []
269
270        if table_description:
271            properties.append(
272                exp.SchemaCommentProperty(
273                    this=exp.Literal.string(self._truncate_table_comment(table_description))
274                )
275            )
276
277        def _to_identifier_if_string(expression: exp.Expr) -> exp.Expr:
278            if isinstance(expression, exp.Literal) and expression.is_string:
279                return exp.to_identifier(expression.this)
280            return expression.copy()
281
282        if table_properties:
283            table_properties = {k.upper(): v for k, v in table_properties.items()}
284
285            table_type = self._pop_creatable_type_from_properties(table_properties)
286            properties.extend(ensure_list(table_type))
287
288            diststyle = table_properties.get("DISTSTYLE")
289            if diststyle:
290                properties.append(exp.DistStyleProperty(this=exp.var(diststyle.name.upper())))
291
292            distkey = table_properties.get("DISTKEY")
293            if distkey:
294                properties.append(exp.DistKeyProperty(this=_to_identifier_if_string(distkey)))
295
296            sortkey = table_properties.get("SORTKEY")
297            if sortkey:
298                sortkey_expressions = sortkey.expressions if sortkey.expressions else [sortkey]
299                properties.append(
300                    exp.SortKeyProperty(
301                        this=[
302                            _to_identifier_if_string(expression)
303                            for expression in sortkey_expressions
304                        ],
305                        compound=False,
306                    )
307                )
308
309        return exp.Properties(expressions=properties) if properties else None
310
311    def replace_query(
312        self,
313        table_name: TableName,
314        query_or_df: QueryOrDF,
315        target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
316        table_description: t.Optional[str] = None,
317        column_descriptions: t.Optional[t.Dict[str, str]] = None,
318        source_columns: t.Optional[t.List[str]] = None,
319        supports_replace_table_override: t.Optional[bool] = None,
320        **kwargs: t.Any,
321    ) -> None:
322        """
323        Redshift doesn't support `CREATE OR REPLACE TABLE...` and it also doesn't support `VALUES` expression so we need to specially
324        handle DataFrame replacements.
325
326        If the table doesn't exist then we just create it and load it with insert statements
327        If it does exist then we need to do the:
328            `CREATE TABLE...`, `INSERT INTO...`, `RENAME TABLE...`, `RENAME TABLE...`, DROP TABLE...`  dance.
329        """
330        import pandas as pd
331
332        target_data_object = self.get_data_object(table_name)
333        table_exists = target_data_object is not None
334        if self.drop_data_object_on_type_mismatch(target_data_object, DataObjectType.TABLE):
335            table_exists = False
336
337        if not isinstance(query_or_df, pd.DataFrame) or not table_exists:
338            return super().replace_query(
339                table_name,
340                query_or_df,
341                target_columns_to_types,
342                table_description,
343                column_descriptions,
344                source_columns=source_columns,
345                **kwargs,
346            )
347        source_queries, target_columns_to_types = self._get_source_queries_and_columns_to_types(
348            query_or_df,
349            target_columns_to_types,
350            target_table=table_name,
351            source_columns=source_columns,
352        )
353        target_columns_to_types = target_columns_to_types or self.columns(table_name)
354        target_table = exp.to_table(table_name)
355        with self.transaction():
356            temp_table = self._get_temp_table(target_table)
357            old_table = self._get_temp_table(target_table)
358            self.create_table(
359                temp_table,
360                target_columns_to_types,
361                exists=False,
362                table_description=table_description,
363                column_descriptions=column_descriptions,
364                **kwargs,
365            )
366            self._insert_append_source_queries(temp_table, source_queries, target_columns_to_types)
367            self.rename_table(target_table, old_table)
368            self.rename_table(temp_table, target_table)
369            self.drop_table(old_table)
370
371    def _get_data_objects(
372        self, schema_name: SchemaName, object_names: t.Optional[t.Set[str]] = None
373    ) -> t.List[DataObject]:
374        """
375        Returns all the data objects that exist in the given schema and optionally catalog.
376        """
377        catalog = self.get_current_catalog()
378        table_query = exp.select(
379            exp.column("schemaname").as_("schema_name"),
380            exp.column("tablename").as_("name"),
381            exp.Literal.string("TABLE").as_("type"),
382        ).from_("pg_tables")
383        view_query = (
384            exp.select(
385                exp.column("schemaname").as_("schema_name"),
386                exp.column("viewname").as_("name"),
387                exp.Literal.string("VIEW").as_("type"),
388            )
389            .from_("pg_views")
390            .where(exp.column("definition").ilike("%create materialized view%").not_())
391        )
392        materialized_view_query = (
393            exp.select(
394                exp.column("schemaname").as_("schema_name"),
395                exp.column("viewname").as_("name"),
396                exp.Literal.string("MATERIALIZED_VIEW").as_("type"),
397            )
398            .from_("pg_views")
399            .where(exp.column("definition").ilike("%create materialized view%"))
400        )
401        subquery = exp.union(
402            table_query,
403            exp.union(view_query, materialized_view_query, distinct=False),
404            distinct=False,
405        )
406        query = (
407            exp.select("*")
408            .from_(subquery.subquery(alias="objs"))
409            .where(exp.column("schema_name").eq(to_schema(schema_name).db))
410        )
411        if object_names:
412            query = query.where(exp.column("name").isin(*object_names))
413        df = self.fetchdf(query)
414        return [
415            DataObject(
416                catalog=catalog,
417                schema=row.schema_name,
418                name=row.name,
419                type=DataObjectType.from_str(row.type),  # type: ignore
420            )
421            for row in df.itertuples()
422        ]
423
424    def merge(
425        self,
426        target_table: TableName,
427        source_table: QueryOrDF,
428        target_columns_to_types: t.Optional[t.Dict[str, exp.DataType]],
429        unique_key: t.Sequence[exp.Expr],
430        when_matched: t.Optional[exp.Whens] = None,
431        merge_filter: t.Optional[exp.Expr] = None,
432        source_columns: t.Optional[t.List[str]] = None,
433        **kwargs: t.Any,
434    ) -> None:
435        if self.enable_merge:
436            # By default we use the logical merge unless the user has opted in
437            super().merge(
438                target_table=target_table,
439                source_table=source_table,
440                target_columns_to_types=target_columns_to_types,
441                unique_key=unique_key,
442                when_matched=when_matched,
443                merge_filter=merge_filter,
444                source_columns=source_columns,
445            )
446        else:
447            logical_merge(
448                self,
449                target_table,
450                source_table,
451                target_columns_to_types,
452                unique_key,
453                when_matched=when_matched,
454                merge_filter=merge_filter,
455                source_columns=source_columns,
456            )
457
458    def _merge(
459        self,
460        target_table: TableName,
461        query: Query,
462        on: exp.Expr,
463        whens: exp.Whens,
464    ) -> None:
465        # Redshift does not support table aliases in the target table of a MERGE statement.
466        # So we must use the actual table name instead of an alias, as we do with the source table.
467        def resolve_target_table(expression: exp.Expr) -> exp.Expr:
468            if (
469                isinstance(expression, exp.Column)
470                and expression.table.upper() == MERGE_TARGET_ALIAS
471            ):
472                expression.set("table", exp.to_table(target_table))
473            return expression
474
475        # Ensure that there is exactly one "WHEN MATCHED" and one "WHEN NOT MATCHED" clause.
476        # Since Redshift does not support multiple "WHEN MATCHED" clauses.
477        if (
478            len(whens.expressions) != 2
479            or whens.expressions[0].args["matched"] == whens.expressions[1].args["matched"]
480        ):
481            raise SQLMeshError(
482                "Redshift only supports a single WHEN MATCHED and WHEN NOT MATCHED clause"
483            )
484
485        using = exp.alias_(
486            exp.Subquery(this=query), alias=MERGE_SOURCE_ALIAS, copy=False, table=True
487        )
488        self.execute(
489            exp.Merge(
490                this=target_table,
491                using=using,
492                on=on.transform(resolve_target_table),
493                whens=whens.transform(resolve_target_table),
494            ),
495            track_rows_processed=True,
496        )
497
498    def _normalize_decimal_value(self, expr: exp.Expr, precision: int) -> exp.Expr:
499        # Redshift is finicky. It truncates when the data is already in a table, but rounds when the data is generated as part of a SELECT.
500        #
501        # The following works:
502        #  > select cast(cast(3.14159 as decimal(6, 5)) as decimal(6, 3)); --produces '3.142', the value we want / what every other database produces
503        #
504        # However, if you write that to a table, and then cast it to a less precise decimal, you get _truncation_.
505        #  > create table foo (val decimal(6, 5)); insert into foo(val) values (3.14159);
506        #  > select cast(val as decimal(6, 3)) from foo; --produces '3.141'
507        #
508        # So to make up for this, we force it to round by injecting a round() expression
509        rounded = exp.func("ROUND", expr, precision)
510
511        return super()._normalize_decimal_value(rounded, precision)
logger = <Logger sqlmesh.core.engine_adapter.redshift (WARNING)>
sqlmesh.core.engine_adapter.redshift

Arguments:

If it does exist then we need to do the:

Inherited Members