Skip to content

Commit b5a520c

Browse files
authored
Support more DuckDB data types; use sets instead of lists for type lookups (#7326)
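## 📝 Summary

Expands the DuckDB type names recognized by `_db_type_to_data_type`. The list of type names is based on https://github.com/marimo-team/codemirror-sql/blob/caa7c664135988b634f55a3e57a1327a5ffeede2/src/dialects/duckdb/duckdb.ts

## 🔍 Description of Changes

- Replaces the inline list literals in `_db_type_to_data_type` with module-level sets (`_INTEGER_TYPES`, `_NUMERIC_TYPES`, `_BOOLEAN_TYPES`, `_STRING_TYPES`, `_TIME_TYPES`, `_DATETIME_TYPES`, `_BINARY_TYPES`, `_UNKNOWN_TYPES`).
- Adds integer aliases (`int`, `int1` through `int128`, `integral`, `long`, `short`, `signed`, `oid`, `varint`) and treats any type starting with `uint` as an integer.
- Adds `dec` as a numeric type, `bool` and `logical` as boolean types, `guid` and `nvarchar` as string types, and `timetz` as a time type.
- Maps the binary types `bitstring`, `binary`, `varbinary`, and `bytea` to `string`, alongside the existing `bit`.
- Maps `row` to `unknown`, alongside the existing `geometry` and null types.
- Folds the bracket check (`[` and `]` in the type name) into the nested-types condition, which now runs after the enum check.
- Adds test assertions for the new type names.

## 📋 Checklist

- [x] I have read the [contributor guidelines](https://github.com/marimo-team/marimo/blob/main/CONTRIBUTING.md).
- [ ] For large changes, or changes that affect the public API: this change was discussed or approved through an issue, on [Discord](https://marimo.io/discord?ref=pr), or the community [discussions](https://github.com/marimo-team/marimo/discussions) (Please provide a link if applicable).
- [x] I have added tests for the changes made.
- [x] I have run the code and verified that it works as expected.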
1 parent 75ac90c commit b5a520c

File tree

2 files changed

+109
-49
lines changed

2 files changed

+109
-49
lines changed

marimo/_data/get_datasets.py

Lines changed: 78 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -453,83 +453,112 @@ def _get_duckdb_database_names(
453453
return []
454454

455455

456+
_INTEGER_TYPES = {
457+
"tinyint",
458+
"smallint",
459+
"integer",
460+
"bigint",
461+
"hugeint",
462+
"integral",
463+
"long",
464+
"short",
465+
"signed",
466+
"oid",
467+
"varint",
468+
"int",
469+
"int1",
470+
"int2",
471+
"int4",
472+
"int8",
473+
"int16",
474+
"int32",
475+
"int64",
476+
"int128",
477+
"ubigint",
478+
"uhugeint",
479+
"usmallint",
480+
"utinyint",
481+
}
482+
_NUMERIC_TYPES = {"float", "real", "double", "decimal", "numeric", "dec"}
483+
_BOOLEAN_TYPES = {"boolean", "bool", "logical"}
484+
_STRING_TYPES = {
485+
"varchar",
486+
"char",
487+
"bpchar",
488+
"text",
489+
"string",
490+
"blob",
491+
"uuid",
492+
"guid",
493+
"nvarchar",
494+
}
495+
_TIME_TYPES = {"time", "time with time zone", "timetz"}
496+
_DATETIME_TYPES = {"datetime", "interval"}
497+
_BINARY_TYPES = {"bit", "bitstring", "binary", "varbinary", "bytea"}
498+
_UNKNOWN_TYPES = {
499+
"row",
500+
"geometry",
501+
# Null type (can occur when attaching databases or with unknown column types)
502+
"null",
503+
'"null"',
504+
}
505+
506+
456507
def _db_type_to_data_type(db_type: str) -> DataType:
457508
"""Convert a DuckDB type to a Marimo data type.
458509
Reference: https://duckdb.org/docs/stable/sql/data_types/overview
510+
Latest types: https://github.com/marimo-team/codemirror-sql/blob/caa7c664135988b634f55a3e57a1327a5ffeede2/src/dialects/duckdb/duckdb.ts
459511
"""
460512
db_type = db_type.lower()
461-
# Numeric types
462-
if db_type in [
463-
"tinyint",
464-
"smallint",
465-
"integer",
466-
"bigint",
467-
"hugeint",
468-
"utinyint",
469-
"usmallint",
470-
"uinteger",
471-
"ubigint",
472-
"uhugeint",
473-
]:
513+
514+
# Check for exact matches first, then patterns
515+
516+
if db_type in _INTEGER_TYPES or db_type.startswith("uint"):
474517
return "integer"
518+
475519
if (
476-
db_type
477-
in [
478-
"float",
479-
"real",
480-
"double",
481-
"decimal",
482-
"numeric",
483-
]
520+
db_type in _NUMERIC_TYPES
484521
or db_type.startswith("decimal")
485522
or db_type.startswith("float")
486523
):
487524
return "number"
488-
# Boolean type
489-
if db_type == "boolean":
525+
526+
if db_type in _BOOLEAN_TYPES:
490527
return "boolean"
491-
# String types
492-
if db_type in [
493-
"varchar",
494-
"char",
495-
"bpchar",
496-
"text",
497-
"string",
498-
"blob",
499-
"uuid",
500-
]:
528+
529+
if db_type in _STRING_TYPES:
501530
return "string"
502-
# Date and Time types
531+
503532
if db_type == "date":
504533
return "date"
505-
if db_type in ["time", "time with time zone"]:
534+
if db_type in _TIME_TYPES:
506535
return "time"
507-
if db_type in ["datetime", "interval"] or db_type.startswith("timestamp"):
536+
if db_type in _DATETIME_TYPES or db_type.startswith("timestamp"):
508537
return "datetime"
538+
539+
# Binary types (represented as string)
540+
if db_type in _BINARY_TYPES:
541+
return "string"
542+
543+
# Enum types (represented as string)
544+
if db_type == "enum" or db_type.startswith("enum"):
545+
return "string"
546+
509547
# Nested types
510-
if "[" in db_type and "]" in db_type:
511-
return "unknown"
512548
if (
513549
db_type.startswith("union")
514550
or db_type.startswith("map")
515551
or db_type.startswith("struct")
516552
or db_type.startswith("list")
517553
or db_type.startswith("array")
518554
or db_type.startswith("json")
555+
or ("[" in db_type and "]" in db_type)
519556
):
520557
return "unknown"
521-
# Special types
522-
if db_type == "bit":
523-
return "string" # Representing bit as string
524-
if db_type == "enum" or db_type.startswith("enum"):
525-
return "string" # Representing enum as string
526-
# Geometry types
527-
if db_type == "geometry":
528-
return "unknown"
529-
# Null type (can occur when attaching databases or with unknown column types)
530-
if db_type == "null" or db_type == '"null"':
558+
559+
# Other special types
560+
if db_type in _UNKNOWN_TYPES:
531561
return "unknown"
532562

533563
LOGGER.warning("Unknown DuckDB type: %s", db_type)
534-
# Unknown type
535564
return "unknown"

tests/_data/test_get_datasets.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -628,23 +628,54 @@ def test_db_type_to_data_type_various() -> None:
628628
# Integer types
629629
assert _db_type_to_data_type("integer") == "integer"
630630
assert _db_type_to_data_type("bigint") == "integer"
631+
assert _db_type_to_data_type("int128") == "integer"
632+
assert _db_type_to_data_type("integral") == "integer"
633+
assert _db_type_to_data_type("long") == "integer"
634+
assert _db_type_to_data_type("short") == "integer"
635+
assert _db_type_to_data_type("signed") == "integer"
636+
assert _db_type_to_data_type("oid") == "integer"
637+
assert _db_type_to_data_type("varint") == "integer"
638+
639+
# Unsigned integers
640+
assert _db_type_to_data_type("utinyint") == "integer"
641+
assert _db_type_to_data_type("usmallint") == "integer"
642+
assert _db_type_to_data_type("uinteger") == "integer"
643+
assert _db_type_to_data_type("ubigint") == "integer"
644+
assert _db_type_to_data_type("uhugeint") == "integer"
645+
assert _db_type_to_data_type("uint128") == "integer"
631646

632647
# Numeric types
633648
assert _db_type_to_data_type("float") == "number"
634649
assert _db_type_to_data_type("double") == "number"
650+
assert _db_type_to_data_type("float4") == "number"
651+
assert _db_type_to_data_type("dec") == "number"
652+
assert _db_type_to_data_type("decimal") == "number"
635653

636654
# String types
637655
assert _db_type_to_data_type("varchar") == "string"
638656
assert _db_type_to_data_type("text") == "string"
657+
assert _db_type_to_data_type("blob") == "string"
658+
assert _db_type_to_data_type("guid") == "string"
659+
assert _db_type_to_data_type("nvarchar") == "string"
660+
661+
# Binary types (represented as string)
662+
assert _db_type_to_data_type("binary") == "string"
663+
assert _db_type_to_data_type("varbinary") == "string"
664+
assert _db_type_to_data_type("bytea") == "string"
639665

640666
# Boolean
641667
assert _db_type_to_data_type("boolean") == "boolean"
668+
assert _db_type_to_data_type("bool") == "boolean"
669+
assert _db_type_to_data_type("logical") == "boolean"
642670

643671
# Date/Time
644672
assert _db_type_to_data_type("date") == "date"
645673
assert _db_type_to_data_type("timestamp") == "datetime"
674+
assert _db_type_to_data_type("timestamptz") == "datetime"
675+
assert _db_type_to_data_type("timetz") == "time"
646676

647677
# Special types
648678
assert _db_type_to_data_type("geometry") == "unknown"
649679
assert _db_type_to_data_type("null") == "unknown"
650680
assert _db_type_to_data_type("json") == "unknown"
681+
assert _db_type_to_data_type("row") == "unknown"

0 commit comments

Comments
 (0)