Skip to content

Commit

Permalink
[SPARK-47684][SQL] Postgres: Map length unspecified bpchar to StringType
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

This PR maps length unspecified bpchar to StringType for Postgres. Length unspecified bpchar represents variable unlimited character string, and the PG JDBC reports its size with Int.MaxValue. W/o this patch, we will always do char padding to 2147483647. This is inefficient and risky.

### Why are the changes needed?

suitable type mapping

### Does this PR introduce _any_ user-facing change?

yes, length unspecified bpchar is mapped to StringType, before, CharType(2147483647)

### How was this patch tested?

new tests

### Was this patch authored or co-authored using generative AI tooling?
no

Closes apache#45810 from yaooqinn/SPARK-47684.

Authored-by: Kent Yao <[email protected]>
Signed-off-by: Kent Yao <[email protected]>
  • Loading branch information
yaooqinn committed Apr 2, 2024
1 parent e2e6e09 commit 22771a6
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -122,18 +122,19 @@ class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite {
).executeUpdate()

conn.prepareStatement("CREATE TABLE char_types (" +
"c0 char(4), c1 character(4), c2 character varying(4), c3 varchar(4), c4 bpchar(1))"
"c0 char(4), c1 character(4), c2 character varying(4), c3 varchar(4), c4 bpchar(1)," +
"c5 bpchar, c6 char)"
).executeUpdate()
conn.prepareStatement("INSERT INTO char_types VALUES " +
"('abcd', 'efgh', 'ijkl', 'mnop', 'q')").executeUpdate()
"('abcd', 'efgh', 'ijkl', 'mnop', 'q', 'eason', 'c' )").executeUpdate()

// SPARK-42916: character/char/bpchar w/o length specifier defaults to int max value, this will
// cause OOM as it will be padded with ' ' to 2147483647.
conn.prepareStatement("CREATE TABLE char_array_types (" +
"c0 char(4)[], c1 character(4)[], c2 character varying(4)[], c3 varchar(4)[], c4 bpchar(1)[])"
"c0 char(4)[], c1 character(4)[], c2 character varying(4)[], c3 varchar(4)[]," +
"c4 bpchar(1)[], c5 bpchar[])"
).executeUpdate()
conn.prepareStatement("INSERT INTO char_array_types VALUES " +
"""('{"a", "bcd"}', '{"ef", "gh"}', '{"i", "j", "kl"}', '{"mnop"}', '{"q", "r"}')"""
"""('{"a", "bcd"}', '{"ef", "gh"}', '{"i", "j", "kl"}', '{"mnop"}', '{"q", "r"}',
| '{"Eason", "Ethan"}')""".stripMargin
).executeUpdate()

conn.prepareStatement("CREATE TABLE money_types (" +
Expand Down Expand Up @@ -184,7 +185,7 @@ class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite {
assert(rows.length == 2)
// Test the types, and values using the first row.
val types = rows(0).toSeq.map(x => x.getClass)
assert(types.length == 42)
assert(types.length == 45)
assert(classOf[String].isAssignableFrom(types(0)))
assert(classOf[java.lang.Integer].isAssignableFrom(types(1)))
assert(classOf[java.lang.Double].isAssignableFrom(types(2)))
Expand Down Expand Up @@ -387,26 +388,13 @@ class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite {

test("character type tests") {
val df = sqlContext.read.jdbc(jdbcUrl, "char_types", new Properties)
val row = df.collect()
assert(row.length == 1)
assert(row(0).length === 5)
assert(row(0).getString(0) === "abcd")
assert(row(0).getString(1) === "efgh")
assert(row(0).getString(2) === "ijkl")
assert(row(0).getString(3) === "mnop")
assert(row(0).getString(4) === "q")
checkAnswer(df, Row("abcd", "efgh", "ijkl", "mnop", "q", "eason", "c"))
}

test("SPARK-32576: character array type tests") {
val df = sqlContext.read.jdbc(jdbcUrl, "char_array_types", new Properties)
val row = df.collect()
assert(row.length == 1)
assert(row(0).length === 5)
assert(row(0).getSeq[String](0) === Seq("a ", "bcd "))
assert(row(0).getSeq[String](1) === Seq("ef ", "gh "))
assert(row(0).getSeq[String](2) === Seq("i", "j", "kl"))
assert(row(0).getSeq[String](3) === Seq("mnop"))
assert(row(0).getSeq[String](4) === Seq("q", "r"))
checkAnswer(df, Row(Seq("a ", "bcd "), Seq("ef ", "gh "), Seq("i", "j", "kl"),
Seq("mnop"), Seq("q", "r"), Seq("Eason", "Ethan")))
}

test("SPARK-34333: money type tests") {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ private case class PostgresDialect() extends JdbcDialect with SQLConfHelper {
// timetz represents time with time zone, currently it maps to Types.TIME.
// We need to change to Types.TIME_WITH_TIMEZONE if the upstream changes.
Some(TimestampType)
case Types.CHAR if "bpchar".equalsIgnoreCase(typeName) && size == Int.MaxValue =>
// bpchar with unspecified length same as text in postgres with blank-trimmed
Some(StringType)
case Types.OTHER if "void".equalsIgnoreCase(typeName) => Some(NullType)
case Types.OTHER => Some(StringType)
case _ if "text".equalsIgnoreCase(typeName) => Some(StringType) // sqlType is Types.VARCHAR
Expand All @@ -91,6 +94,7 @@ private case class PostgresDialect() extends JdbcDialect with SQLConfHelper {
case "float4" => Some(FloatType)
case "float8" => Some(DoubleType)
case "varchar" => Some(VarcharType(precision))
case "bpchar" if precision == Int.MaxValue => Some(StringType)
case "char" | "bpchar" => Some(CharType(precision))
case "text" | "cidr" | "inet" | "json" | "jsonb" | "uuid" |
"xml" | "tsvector" | "tsquery" | "macaddr" | "macaddr8" | "txid_snapshot" | "point" |
Expand Down

0 comments on commit 22771a6

Please sign in to comment.