Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 108 additions & 1 deletion bigframes/bigquery/_operations/ai.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from __future__ import annotations

import json
from typing import Any, Iterable, List, Literal, Mapping, Tuple, Union
from typing import Any, Iterable, List, Literal, Mapping, Optional, Tuple, Union

import pandas as pd

Expand All @@ -28,6 +28,7 @@
from bigframes import series, session
from bigframes.core import convert
from bigframes.core.logging import log_adapter
import bigframes.core.sql.literals
from bigframes.ml import core as ml_core
from bigframes.operations import ai_ops, output_schemas

Expand Down Expand Up @@ -388,6 +389,112 @@ def generate_double(
return series_list[0]._apply_nary_op(operator, series_list[1:])


@log_adapter.method_logger(custom_base_name="bigquery_ai")
def generate_embedding(
    model_name: str,
    data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series],
    *,
    output_dimensionality: Optional[int] = None,
    task_type: Optional[str] = None,
    start_second: Optional[float] = None,
    end_second: Optional[float] = None,
    interval_seconds: Optional[float] = None,
    trial_id: Optional[int] = None,
) -> dataframe.DataFrame:
    """
    Creates embeddings that describe an entity—for example, a piece of text or an image.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> df = bpd.DataFrame({"content": ["apple", "bear", "pear"]})
        >>> bbq.ai.generate_embedding(
        ...     "project.dataset.model_name",
        ...     df
        ... )  # doctest: +SKIP

    Args:
        model_name (str):
            The name of a remote model over a Vertex AI multimodalembedding@001 model.
        data (bigframes.pandas.DataFrame, bigframes.pandas.Series, pandas.DataFrame or pandas.Series):
            The data to generate embeddings for. pandas objects are first
            loaded with ``read_pandas``. If a Series is provided, it is
            treated as the 'content' column. If a DataFrame is provided, it
            must contain a 'content' column, or you must rename the column you
            wish to embed to 'content'.
        output_dimensionality (int, optional):
            An INT64 value that specifies the number of dimensions to use when
            generating embeddings. For example, if you specify 256 AS
            output_dimensionality, then the embedding output column contains a
            256-dimensional embedding for each input value. To find the
            supported range of output dimensions, read about the available
            `Google text embedding models <https://docs.cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings#google-models>`_.
        task_type (str, optional):
            A STRING literal that specifies the intended downstream application to
            help the model produce better quality embeddings. For a list of
            supported task types and how to choose which one to use, see `Choose an
            embeddings task type <https://docs.cloud.google.com/vertex-ai/generative-ai/docs/embeddings/task-types>`_.
        start_second (float, optional):
            The second in the video at which to start the embedding. The default value is 0.
        end_second (float, optional):
            The second in the video at which to end the embedding. The default value is 120.
        interval_seconds (float, optional):
            The interval to use when creating embeddings. The default value is 16.
        trial_id (int, optional):
            An INT64 value that identifies the hyperparameter tuning trial that
            you want the function to evaluate. The function uses the optimal
            trial by default. Only specify this argument if you ran
            hyperparameter tuning when creating the model.

    Returns:
        bigframes.pandas.DataFrame:
            A new DataFrame with the generated embeddings. See the `SQL
            reference for AI.GENERATE_EMBEDDING
            <https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-embedding#output>`_
            for details.

    Raises:
        ValueError: If ``data`` is not a supported DataFrame or Series type.
    """
    if isinstance(data, (pd.DataFrame, pd.Series)):
        data = bpd.read_pandas(data)

    if isinstance(data, series.Series):
        # Rename a copy so the caller's Series keeps its original name.
        data = data.copy()
        data.name = "content"
        data_df = data.to_frame()
    elif isinstance(data, dataframe.DataFrame):
        data_df = data
    else:
        raise ValueError(f"Unsupported data type: {type(data)}")

    # We need to get the SQL for the input data to pass as a subquery to the TVF
    source_sql = data_df.sql

    # Only options the caller actually set are forwarded in the STRUCT argument.
    candidate_options = {
        "OUTPUT_DIMENSIONALITY": output_dimensionality,
        "TASK_TYPE": task_type,
        "START_SECOND": start_second,
        "END_SECOND": end_second,
        "INTERVAL_SECONDS": interval_seconds,
        "TRIAL_ID": trial_id,
    }
    struct_fields = {
        option_name: option_value
        for option_name, option_value in candidate_options.items()
        if option_value is not None
    }
    options_sql = bigframes.core.sql.literals.struct_literal(struct_fields)

    # Construct the TVF query. The call takes exactly three arguments and a
    # single closing parenthesis; a second one makes the statement invalid.
    query = f"""
        SELECT *
        FROM AI.GENERATE_EMBEDDING(
            MODEL `{model_name}`,
            ({source_sql}),
            {options_sql}
        )
    """

    return data_df._session.read_gbq(query)


@log_adapter.method_logger(custom_base_name="bigquery_ai")
def if_(
prompt: PROMPT_TYPE,
Expand Down
3 changes: 2 additions & 1 deletion bigframes/core/pyformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

from bigframes.core import utils
import bigframes.core.local_data
import bigframes.core.sql.literals
from bigframes.core.tools import bigquery_schema
import bigframes.session

Expand Down Expand Up @@ -120,7 +121,7 @@ def _validate_type(name: str, value: Any):

supported_types = (
typing.get_args(_BQ_TABLE_TYPES)
+ typing.get_args(bigframes.core.sql.SIMPLE_LITERAL_TYPES)
+ typing.get_args(bigframes.core.sql.literals.SIMPLE_LITERAL_TYPES)
+ (bigframes.dataframe.DataFrame,)
+ (pandas.DataFrame,)
)
Expand Down
74 changes: 1 addition & 73 deletions bigframes/core/sql/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,91 +17,19 @@
Utility functions for SQL construction.
"""

import datetime
import decimal
import json
import math
from typing import cast, Collection, Iterable, Mapping, Optional, TYPE_CHECKING, Union

import shapely.geometry.base # type: ignore

import bigframes.core.compile.googlesql as googlesql
from bigframes.core.sql.literals import simple_literal

if TYPE_CHECKING:
import google.cloud.bigquery as bigquery

import bigframes.core.ordering


# shapely.wkt.dumps was moved to shapely.io.to_wkt in 2.0.
try:
from shapely.io import to_wkt # type: ignore
except ImportError:
from shapely.wkt import dumps # type: ignore

to_wkt = dumps


# Python value types that simple_literal() is annotated to accept. Member
# order is kept stable: bytes/str, then numerics, then temporal types,
# then Decimal and list.
SIMPLE_LITERAL_TYPES = Union[
    bytes, str,
    int, bool, float,
    datetime.datetime, datetime.date, datetime.time,
    decimal.Decimal,
    list,
]


### Writing SQL Values (literals, column references, table references, etc.)
def simple_literal(value: Union[SIMPLE_LITERAL_TYPES, None]) -> str:
    """Render a Python value as the text of a SQL literal expression.

    Args:
        value: ``None``, a string, bytes, bool, int, float, datetime, date,
            time, shapely geometry, ``Decimal``, or a list of such values.

    Returns:
        The SQL text for the value; ``None`` becomes ``NULL`` and lists are
        rendered element by element inside square brackets.

    Raises:
        ValueError: If no literal form is defined for the type of ``value``.
    """
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals
    if value is None:
        return "NULL"

    if isinstance(value, str):
        # Single quoting seems to work nicer with ibis than double quoting
        escaped = googlesql._escape_chars(value)
        return f"'{escaped}'"

    if isinstance(value, bytes):
        return repr(value)

    # bool is tested together with int, and before float.
    if isinstance(value, (bool, int)):
        return str(value)

    if isinstance(value, float):
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#floating_point_literals
        if math.isnan(value):
            return 'CAST("nan" as FLOAT)'
        if math.isinf(value):
            return 'CAST("+inf" as FLOAT)' if value > 0 else 'CAST("-inf" as FLOAT)'
        return str(value)

    # datetime must be tested before date because it is a subclass of date.
    if isinstance(value, datetime.datetime):
        constructor = "DATETIME" if value.tzinfo is None else "TIMESTAMP"
        return f"{constructor}('{value.isoformat()}')"

    if isinstance(value, datetime.date):
        return f"DATE('{value.isoformat()}')"

    if isinstance(value, datetime.time):
        return f"TIME(DATETIME('1970-01-01 {value.isoformat()}'))"

    if isinstance(value, shapely.geometry.base.BaseGeometry):
        wkt_literal = simple_literal(to_wkt(value))
        return f"ST_GEOGFROMTEXT({wkt_literal})"

    if isinstance(value, decimal.Decimal):
        # TODO: disambiguate BIGNUMERIC based on scale and/or precision
        return f"CAST('{value!s}' AS NUMERIC)"

    if isinstance(value, list):
        elements = ", ".join(map(simple_literal, value))
        return f"[{elements}]"

    raise ValueError(f"Cannot produce literal for {value}")


def multi_literal(*values: str):
    """Render the values as a parenthesized, comma-separated list of SQL literals.

    Each positional value is converted with ``simple_literal`` and the
    results are joined with ``", "`` inside one pair of parentheses.
    """
    rendered = ", ".join(simple_literal(item) for item in values)
    return f"({rendered})"
Expand Down
99 changes: 99 additions & 0 deletions bigframes/core/sql/literals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import datetime
import decimal
import math
from typing import Mapping, Union

import shapely.geometry.base # type: ignore

import bigframes.core.compile.googlesql as googlesql

# shapely.wkt.dumps was moved to shapely.io.to_wkt in 2.0.
try:
from shapely.io import to_wkt # type: ignore
except ImportError:
from shapely.wkt import dumps # type: ignore

to_wkt = dumps


# Python value types that simple_literal() is annotated to accept. Member
# order is kept stable: bytes/str, then numerics, then temporal types,
# then Decimal and list.
SIMPLE_LITERAL_TYPES = Union[
    bytes, str,
    int, bool, float,
    datetime.datetime, datetime.date, datetime.time,
    decimal.Decimal,
    list,
]


def simple_literal(value: Union[SIMPLE_LITERAL_TYPES, None]) -> str:
    """Render a Python value as the text of a SQL literal expression.

    Args:
        value: ``None``, a string, bytes, bool, int, float, datetime, date,
            time, shapely geometry, ``Decimal``, or a list of such values.

    Returns:
        The SQL text for the value; ``None`` becomes ``NULL`` and lists are
        rendered element by element inside square brackets.

    Raises:
        ValueError: If no literal form is defined for the type of ``value``.
    """
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals
    if value is None:
        return "NULL"

    if isinstance(value, str):
        # Single quoting seems to work nicer with ibis than double quoting
        escaped = googlesql._escape_chars(value)
        return f"'{escaped}'"

    if isinstance(value, bytes):
        return repr(value)

    # bool is tested together with int, and before float.
    if isinstance(value, (bool, int)):
        return str(value)

    if isinstance(value, float):
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#floating_point_literals
        if math.isnan(value):
            return 'CAST("nan" as FLOAT)'
        if math.isinf(value):
            return 'CAST("+inf" as FLOAT)' if value > 0 else 'CAST("-inf" as FLOAT)'
        return str(value)

    # datetime must be tested before date because it is a subclass of date.
    if isinstance(value, datetime.datetime):
        constructor = "DATETIME" if value.tzinfo is None else "TIMESTAMP"
        return f"{constructor}('{value.isoformat()}')"

    if isinstance(value, datetime.date):
        return f"DATE('{value.isoformat()}')"

    if isinstance(value, datetime.time):
        return f"TIME(DATETIME('1970-01-01 {value.isoformat()}'))"

    if isinstance(value, shapely.geometry.base.BaseGeometry):
        wkt_literal = simple_literal(to_wkt(value))
        return f"ST_GEOGFROMTEXT({wkt_literal})"

    if isinstance(value, decimal.Decimal):
        # TODO: disambiguate BIGNUMERIC based on scale and/or precision
        return f"CAST('{value!s}' AS NUMERIC)"

    if isinstance(value, list):
        elements = ", ".join(map(simple_literal, value))
        return f"[{elements}]"

    raise ValueError(f"Cannot produce literal for {value}")


def struct_literal(struct_options: Mapping[str, SIMPLE_LITERAL_TYPES]) -> str:
    """Render a mapping as a SQL ``STRUCT(<literal> AS <name>, ...)`` expression.

    Args:
        struct_options: Field names mapped to their values. Each value is
            rendered with ``simple_literal``; each name is emitted verbatim
            as the field alias, in the mapping's iteration order.

    Returns:
        The ``STRUCT(...)`` text; an empty mapping yields ``STRUCT()``.
    """
    fields_sql = ", ".join(
        f"{simple_literal(field_value)} AS {field_name}"
        for field_name, field_value in struct_options.items()
    )
    return f"STRUCT({fields_sql})"
7 changes: 2 additions & 5 deletions bigframes/core/sql/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import bigframes.core.compile.googlesql as googlesql
import bigframes.core.sql
import bigframes.core.sql.literals


def create_model_ddl(
Expand Down Expand Up @@ -105,11 +106,7 @@ def _build_struct_sql(
if not struct_options:
return ""

rendered_options = []
for option_name, option_value in struct_options.items():
rendered_val = bigframes.core.sql.simple_literal(option_value)
rendered_options.append(f"{rendered_val} AS {option_name}")
return f", STRUCT({', '.join(rendered_options)})"
return f", {bigframes.core.sql.literals.struct_literal}"


def evaluate(
Expand Down
Loading
Loading