From 9b57a5802bc995f5892b2ed05e10ff88272682f8 Mon Sep 17 00:00:00 2001 From: Eduardo Ruiz Date: Mon, 3 Jan 2022 09:33:28 +0100 Subject: [PATCH 1/2] doc: Added some links to spark functions --- .../scala/doric/syntax/BinaryColumns.scala | 6 ++ .../scala/doric/syntax/BooleanColumns.scala | 2 + .../scala/doric/syntax/CommonColumns.scala | 6 ++ .../main/scala/doric/syntax/DateColumns.scala | 50 +++++++++++----- .../scala/doric/syntax/StringColumns.scala | 59 ++++++++++++++++--- .../scala/doric/syntax/TimestampColumns.scala | 8 +++ .../doric/syntax/StringColumnsSpec.scala | 4 +- 7 files changed, 110 insertions(+), 25 deletions(-) diff --git a/core/src/main/scala/doric/syntax/BinaryColumns.scala b/core/src/main/scala/doric/syntax/BinaryColumns.scala index f6e64633d..bb5440c25 100644 --- a/core/src/main/scala/doric/syntax/BinaryColumns.scala +++ b/core/src/main/scala/doric/syntax/BinaryColumns.scala @@ -19,6 +19,7 @@ private[syntax] trait BinaryColumns { * as a 32 character hex string. * * @group Binary Type + * @see [[org.apache.spark.sql.functions.md5]] */ def md5: StringColumn = column.elem.map(f.md5).toDC @@ -27,6 +28,7 @@ private[syntax] trait BinaryColumns { * as a 40 character hex string. * * @group Binary Type + * @see [[org.apache.spark.sql.functions.sha1]] */ def sha1: StringColumn = column.elem.map(f.sha1).toDC @@ -36,6 +38,7 @@ private[syntax] trait BinaryColumns { * * @throws java.lang.IllegalArgumentException if numBits is not in the permitted values * @group Binary Type + * @see [[org.apache.spark.sql.functions.sha2]] */ def sha2(numBits: Int): StringColumn = column.elem.map(x => f.sha2(x, numBits)).toDC @@ -45,6 +48,7 @@ private[syntax] trait BinaryColumns { * returns the value as a long column. * * @group Binary Type + * @see [[org.apache.spark.sql.functions.crc32]] */ def crc32: LongColumn = column.elem.map(f.crc32).toDC @@ -53,6 +57,7 @@ private[syntax] trait BinaryColumns { * This is the reverse of unbase64. * * @group Binary Type + * @see [[org.apache.spark.sql.functions.base64]] */ def base64: StringColumn = column.elem.map(f.base64).toDC @@ -62,6 +67,7 @@ private[syntax] trait BinaryColumns { * If either argument is null, the result will also be null. 
* * @group Binary Type + * @see [[org.apache.spark.sql.functions.decode]] */ def decode(charset: StringColumn): StringColumn = (column.elem, charset.elem) diff --git a/core/src/main/scala/doric/syntax/BooleanColumns.scala b/core/src/main/scala/doric/syntax/BooleanColumns.scala index 51c0814b9..844a8af09 100644 --- a/core/src/main/scala/doric/syntax/BooleanColumns.scala +++ b/core/src/main/scala/doric/syntax/BooleanColumns.scala @@ -67,6 +67,7 @@ private[syntax] trait BooleanColumns { * * @throws java.lang.RuntimeException if the condition is false * @group Boolean Type + * @see [[org.apache.spark.sql.functions.assert_true]] */ def assertTrue: NullColumn = column.elem.map(f.assert_true).toDC @@ -75,6 +76,7 @@ private[syntax] trait BooleanColumns { * * @throws java.lang.RuntimeException if the condition is false * @group Boolean Type + * @see [[org.apache.spark.sql.functions.assert_true]] */ def assertTrue(msg: StringColumn): NullColumn = (column.elem, msg.elem).mapN(f.assert_true).toDC diff --git a/core/src/main/scala/doric/syntax/CommonColumns.scala b/core/src/main/scala/doric/syntax/CommonColumns.scala index 887e2a266..89fa1b824 100644 --- a/core/src/main/scala/doric/syntax/CommonColumns.scala +++ b/core/src/main/scala/doric/syntax/CommonColumns.scala @@ -31,6 +31,7 @@ private[syntax] trait CommonColumns extends ColGetters[NamedDoricColumn] { * the DoricColumns to coalesce * @return * the first column that is not null, or null if all inputs are null. + * @see [[org.apache.spark.sql.functions.coalesce]] */ def coalesce[T](cols: DoricColumn[T]*): DoricColumn[T] = cols.map(_.elem).toList.sequence.map(f.coalesce(_: _*)).toDC @@ -39,6 +40,7 @@ private[syntax] trait CommonColumns extends ColGetters[NamedDoricColumn] { * Calculates the hash code of given columns, and returns the result as an integer column. * * @group All Types + * @see [[org.apache.spark.sql.functions.hash]] */ def hash(cols: DoricColumn[_]*): IntegerColumn = cols.map(_.elem).toList.sequence.map(f.hash(_: _*)).toDC @@ -48,6 +50,7 @@ private[syntax] trait CommonColumns extends ColGetters[NamedDoricColumn] { * variant of the xxHash algorithm, and returns the result as a long column. * * @group All Types + * @see [[org.apache.spark.sql.functions.xxhash64]] */ def xxhash64(cols: DoricColumn[_]*): LongColumn = cols.map(_.elem).toList.sequence.map(f.xxhash64(_: _*)).toDC @@ -181,6 +184,7 @@ private[syntax] trait CommonColumns extends ColGetters[NamedDoricColumn] { * literals to compare to * @return * Boolean DoricColumn with the comparation logic. 
+ * @see [[org.apache.spark.sql.Column.isin]] */ def isIn(elems: T*): BooleanColumn = column.elem.map(_.isin(elems: _*)).toDC @@ -189,6 +193,7 @@ private[syntax] trait CommonColumns extends ColGetters[NamedDoricColumn] { * @group All Types * @return * Boolean DoricColumn + * @see [[org.apache.spark.sql.Column.isNull]] */ def isNull: BooleanColumn = column.elem.map(_.isNull).toDC @@ -197,6 +202,7 @@ private[syntax] trait CommonColumns extends ColGetters[NamedDoricColumn] { * @group All Types * @return * Boolean DoricColumn + * @see [[org.apache.spark.sql.Column.isNotNull]] */ def isNotNull: BooleanColumn = column.elem.map(_.isNotNull).toDC diff --git a/core/src/main/scala/doric/syntax/DateColumns.scala b/core/src/main/scala/doric/syntax/DateColumns.scala index 4b21001ff..aaf122e84 100644 --- a/core/src/main/scala/doric/syntax/DateColumns.scala +++ b/core/src/main/scala/doric/syntax/DateColumns.scala @@ -15,6 +15,7 @@ private[syntax] trait DateColumns { * All calls of current_date within the same query return the same value. * * @group Date Type + * @see [[org.apache.spark.sql.functions.current_date]] */ def currentDate(): DateColumn = f.current_date().asDoric[Date] @@ -32,6 +33,7 @@ private[syntax] trait DateColumns { * Date column after adding months * @note * Timestamp columns will be truncated to Date column + * @see [[org.apache.spark.sql.functions.add_months]] */ def addMonths(nMonths: IntegerColumn): DateColumn = (column.elem, nMonths.elem).mapN(f.add_months).toDC @@ -44,6 +46,7 @@ private[syntax] trait DateColumns { * @note * Timestamp columns will be truncated to Date column * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.date_add]] */ def addDays(days: IntegerColumn): DateColumn = (column.elem, days.elem).mapN(f.date_add).toDC @@ -59,6 +62,7 @@ private[syntax] trait DateColumns { * Use specialized functions like 'year' whenever possible as they benefit from a * specialized implementation. * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.date_format]] */ def format(format: StringColumn): StringColumn = (column.elem, format.elem) @@ -75,6 +79,7 @@ private[syntax] trait DateColumns { * @note * Timestamp columns will be truncated to Date column * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.date_sub]] */ def subDays(days: IntegerColumn): DateColumn = (column.elem, days.elem).mapN(f.date_sub).toDC @@ -85,6 +90,7 @@ private[syntax] trait DateColumns { * @param dateCol * A Date or Timestamp column * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.datediff]] */ def diff(dateCol: DoricColumn[T]): IntegerColumn = (column.elem, dateCol.elem) @@ -95,6 +101,7 @@ private[syntax] trait DateColumns { * Extracts the day of the month as an integer from a given date. * * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.dayofmonth]] */ def dayOfMonth: IntegerColumn = column.elem.map(f.dayofmonth).toDC @@ -103,6 +110,7 @@ private[syntax] trait DateColumns { * Ranges from 1 for a Sunday through to 7 for a Saturday * * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.dayofweek]] */ def dayOfWeek: IntegerColumn = column.elem.map(f.dayofweek).toDC @@ -110,6 +118,7 @@ private[syntax] trait DateColumns { * Extracts the day of the year as an integer from a given date. 
* * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.dayofyear]] */ def dayOfYear: IntegerColumn = column.elem.map(f.dayofyear).toDC @@ -117,6 +126,7 @@ private[syntax] trait DateColumns { * Sets the moment to the last day of the same month. * * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.last_day]] */ def endOfMonth: DateColumn = lastDayOfMonth @@ -126,6 +136,7 @@ private[syntax] trait DateColumns { * month in July 2015. * * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.last_day]] */ def lastDayOfMonth: DateColumn = column.elem.map(f.last_day).toDC @@ -133,6 +144,7 @@ private[syntax] trait DateColumns { * Extracts the month as an integer from a given date. * * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.month]] */ def month: IntegerColumn = column.elem.map(f.month).toDC @@ -143,7 +155,7 @@ private[syntax] trait DateColumns { * of their respective months. Otherwise, the difference is calculated assuming 31 days per month. * * For example: - * {{{ + * @example {{{ * Date("2017-11-14").monthsBetween(Date("2017-07-14")) // returns 4.0 * Date("2017-01-01").monthsBetween(Date("2017-01-10")) // returns 0.29032258 * Timestamp("2017-06-01 00:00:00").monthsBetween(Timestamp("2017-06-16 12:00:00")) // returns -0.5 @@ -152,6 +164,7 @@ private[syntax] trait DateColumns { * @param dateCol * Date or Timestamp column * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.months_between]] */ def monthsBetween(dateCol: DoricColumn[T]): DoubleColumn = (column.elem, dateCol.elem).mapN(f.months_between).toDC @@ -165,6 +178,7 @@ private[syntax] trait DateColumns { * If `roundOff` is set to true, the result is rounded off to 8 digits; * it is not rounded otherwise. * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.months_between]] */ def monthsBetween( dateCol: DoricColumn[T], @@ -180,14 +194,15 @@ private[syntax] trait DateColumns { * Returns the first date which is later than the value of the `date` column that is on the * specified day of the week. * - * For example, `Date("2015-07-27").nextDay("Sunday")` returns Date("2015-08-02") because - * that is the first Sunday after 2015-07-27. + * @example For example, `Date("2015-07-27").nextDay("Sunday")` returns Date("2015-08-02") + * because that is the first Sunday after 2015-07-27. * * @param dayOfWeek * Case insensitive, and accepts: "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun" * @note * Timestamp columns will be truncated to Date column * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.next_day]] */ def nextDay(dayOfWeek: StringColumn): DateColumn = (column.elem, dayOfWeek.elem) @@ -200,28 +215,30 @@ private[syntax] trait DateColumns { * Extracts the quarter as an integer from a given date. * * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.quarter]] */ def quarter: IntegerColumn = column.elem.map(f.quarter).toDC /** * Returns date truncated to the unit specified by the format. 
* - * For example, `Timestamp("2018-11-19 12:01:19").trunc("year")` returns Date("2018-01-01") + * @example For example, `Timestamp("2018-11-19 12:01:19").trunc("year")` returns Date("2018-01-01") * * @param format - * if date: - * * 'year', 'yyyy', 'yy' to truncate by year, - * * 'month', 'mon', 'mm' to truncate by month - * Other options are: 'week', 'quarter' - * if timestamp: - * * 'year', 'yyyy', 'yy' to truncate by year, - * * 'month', 'mon', 'mm' to truncate by month, - * * 'day', 'dd' to truncate by day, - * Other options are: - * * 'microsecond', 'millisecond', 'second', 'minute', 'hour', 'week', 'quarter' + * - if date: + * - 'year', 'yyyy', 'yy' to truncate by year, + * - 'month', 'mon', 'mm' to truncate by month + * - __Other options are__: 'week', 'quarter' + * - if timestamp: + * - 'year', 'yyyy', 'yy' to truncate by year, + * - 'month', 'mon', 'mm' to truncate by month, + * - 'day', 'dd' to truncate by day, + * - __Other options are__: 'microsecond', 'millisecond', 'second', 'minute', 'hour', 'week', 'quarter' * @note * Timestamp columns will be truncated to Date column * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.trunc]] + * @see [[org.apache.spark.sql.functions.date_trunc]] */ def truncate(format: StringColumn): DoricColumn[T] = (column.elem, format.elem) @@ -243,6 +260,7 @@ private[syntax] trait DateColumns { * A long * * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.unix_timestamp]] */ def unixTimestamp: LongColumn = column.elem.map(f.unix_timestamp).toDC @@ -253,6 +271,7 @@ private[syntax] trait DateColumns { * as defined by ISO 8601 * * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.weekofyear]] */ def weekOfYear: IntegerColumn = column.elem.map(f.weekofyear).toDC @@ -260,6 +279,7 @@ private[syntax] trait DateColumns { * Extracts the year as an integer from a given date. * * @group Date & Timestamp Type + * @see [[org.apache.spark.sql.functions.year]] */ def year: IntegerColumn = column.elem.map(f.year).toDC @@ -267,6 +287,7 @@ private[syntax] trait DateColumns { * Transform date to timestamp * * @group Date Type + * @see [[org.apache.spark.sql.functions.to_timestamp]] */ def toTimestamp: TimestampColumn = column.elem.map(f.to_timestamp).toDC @@ -274,6 +295,7 @@ private[syntax] trait DateColumns { * Transform date to Instant * * @group Date Type + * @see [[org.apache.spark.sql.functions.to_timestamp]] */ def toInstant: InstantColumn = column.elem.map(f.to_timestamp).toDC } diff --git a/core/src/main/scala/doric/syntax/StringColumns.scala b/core/src/main/scala/doric/syntax/StringColumns.scala index ab6d98004..aea4b917d 100644 --- a/core/src/main/scala/doric/syntax/StringColumns.scala +++ b/core/src/main/scala/doric/syntax/StringColumns.scala @@ -17,6 +17,7 @@ private[syntax] trait StringColumns { * @return * a reference of a single DoricColumn with all strings concatenated. If at * least one is null will return null. + * @see [[org.apache.spark.sql.functions.concat]] */ def concat(cols: StringColumn*): StringColumn = cols.map(_.elem).toList.sequence.map(f.concat(_: _*)).toDC @@ -63,6 +64,7 @@ private[syntax] trait StringColumns { * @return * Formats the arguments in printf-style and returns the result as a string * column. + * @see [[org.apache.spark.sql.functions.format_string]] */ def formatString( format: StringColumn, @@ -109,6 +111,7 @@ private[syntax] trait StringColumns { * and returns the result as an int column. 
* * @group String Type + * @see [[org.apache.spark.sql.functions.ascii]] */ def ascii: IntegerColumn = s.elem.map(f.ascii).toDC @@ -116,11 +119,12 @@ private[syntax] trait StringColumns { * Returns a new string column by converting the first letter of each word * to uppercase. Words are delimited by whitespace. * - * For example, "hello world" will become "Hello World". + * @example For example, "hello world" will become "Hello World". * * @group String Type + * @see [[org.apache.spark.sql.functions.initcap]] */ - def initcap: StringColumn = s.elem.map(f.initcap).toDC + def initCap: StringColumn = s.elem.map(f.initcap).toDC /** * Locate the position of the first occurrence of substr column in the @@ -130,8 +134,9 @@ private[syntax] trait StringColumns { * @note * The position is not zero based, but 1 based index. Returns 0 if substr * could not be found in str. + * @see [[org.apache.spark.sql.functions.instr]] */ - def instr(substring: StringColumn): IntegerColumn = + def inStr(substring: StringColumn): IntegerColumn = (s.elem, substring.elem) .mapN((str, substr) => { new Column(StringInstr(str.expr, substr.expr)) @@ -144,6 +149,7 @@ private[syntax] trait StringColumns { * spaces. The length of binary strings includes binary zeros. * * @group String Type + * @see [[org.apache.spark.sql.functions.length]] */ def length: IntegerColumn = s.elem.map(f.length).toDC @@ -151,6 +157,7 @@ private[syntax] trait StringColumns { * Computes the Levenshtein distance of the two given string columns. * * @group String Type + * @see [[org.apache.spark.sql.functions.levenshtein]] */ def levenshtein(dc: StringColumn): IntegerColumn = (s.elem, dc.elem).mapN(f.levenshtein).toDC @@ -163,6 +170,7 @@ private[syntax] trait StringColumns { * @note * The position is not zero based, but 1 based index. returns 0 if substr * could not be found in str. + * @see [[org.apache.spark.sql.functions.locate]] */ def locate( substr: StringColumn, @@ -178,6 +186,7 @@ private[syntax] trait StringColumns { * Converts a string column to lower case. * * @group String Type + * @see [[org.apache.spark.sql.functions.lower]] */ def lower: StringColumn = s.elem.map(f.lower).toDC @@ -187,6 +196,7 @@ private[syntax] trait StringColumns { * characters. * * @group String Type + * @see [[org.apache.spark.sql.functions.lpad]] */ def lpad(len: IntegerColumn, pad: StringColumn): StringColumn = (s.elem, len.elem, pad.elem) @@ -199,6 +209,7 @@ private[syntax] trait StringColumns { * Trim the spaces from left end for the specified string value. * * @group String Type + * @see [[org.apache.spark.sql.functions.ltrim]] */ def ltrim: StringColumn = s.elem.map(f.ltrim).toDC @@ -207,6 +218,7 @@ private[syntax] trait StringColumns { * string column. * * @group String Type + * @see [[org.apache.spark.sql.functions.ltrim]] */ def ltrim(trimString: StringColumn): StringColumn = (s.elem, trimString.elem) @@ -220,6 +232,7 @@ private[syntax] trait StringColumns { * byte position `pos` of `src` and proceeding for `len` bytes. * * @group String Type + * @see [[org.apache.spark.sql.functions.overlay]] */ def overlay( replace: StringColumn, @@ -237,6 +250,7 @@ private[syntax] trait StringColumns { * * @throws java.lang.IllegalArgumentException if the specified group index exceeds the group count of regex * @group String Type + * @see [[org.apache.spark.sql.functions.regexp_extract]] */ def regexpExtract( exp: StringColumn, @@ -253,6 +267,7 @@ private[syntax] trait StringColumns { * with replacement. 
* * @group String Type + * @see [[org.apache.spark.sql.functions.regexp_replace]] */ def regexpReplace( pattern: StringColumn, @@ -264,6 +279,7 @@ private[syntax] trait StringColumns { * Repeats a string column n times, and returns it as a new string column. * * @group String Type + * @see [[org.apache.spark.sql.functions.repeat]] */ def repeat(n: IntegerColumn): StringColumn = (s.elem, n.elem) .mapN((str, times) => new Column(StringRepeat(str.expr, times.expr))) @@ -275,6 +291,7 @@ private[syntax] trait StringColumns { * characters. * * @group String Type + * @see [[org.apache.spark.sql.functions.rpad]] */ def rpad(len: IntegerColumn, pad: StringColumn): StringColumn = (s.elem, len.elem, pad.elem) @@ -285,6 +302,7 @@ private[syntax] trait StringColumns { * Trim the spaces from right end for the specified string value. * * @group String Type + * @see [[org.apache.spark.sql.functions.rtrim]] */ def rtrim: StringColumn = s.elem.map(f.rtrim).toDC @@ -293,6 +311,7 @@ private[syntax] trait StringColumns { * string column. * * @group String Type + * @see [[org.apache.spark.sql.functions.rtrim]] */ def rtrim(trimString: StringColumn): StringColumn = (s.elem, trimString.elem) @@ -303,6 +322,7 @@ private[syntax] trait StringColumns { * Returns the soundex code for the specified expression. * * @group String Type + * @see [[org.apache.spark.sql.functions.soundex]] */ def soundex: StringColumn = s.elem.map(f.soundex).toDC @@ -314,6 +334,7 @@ private[syntax] trait StringColumns { * a Java regular expression. * * @group String Type + * @see [[org.apache.spark.sql.functions.split]] */ def split(pattern: StringColumn): ArrayColumn[String] = split(pattern, (-1).lit) @@ -326,12 +347,12 @@ private[syntax] trait StringColumns { * a string representing a regular expression. The regex string should be * a Java regular expression. * @param limit - * an integer expression which controls the number of times the regex is - * applied. + * an integer expression which controls the number of times the regex is applied. + * - __limit greater than 0__: The resulting array's length + * will not be more than limit, and the resulting array's last entry will + * contain all input beyond the last matched regex. + * - __limit less than or equal to 0__: `regex` will be applied as many times as possible, + * and the resulting array can be of any size. */ def split( pattern: StringColumn, @@ -349,6 +370,7 @@ private[syntax] trait StringColumns { * @group String Type * @note * The position is not zero based, but 1 based index. + * @see [[org.apache.spark.sql.functions.substring]] */ def substring(pos: IntegerColumn, len: IntegerColumn): StringColumn = (s.elem, pos.elem, len.elem) @@ -364,6 +386,7 @@ private[syntax] trait StringColumns { * for delim. * * @group String Type + * @see [[org.apache.spark.sql.functions.substring_index]] */ def substringIndex( delim: StringColumn, @@ -382,6 +405,7 @@ private[syntax] trait StringColumns { * string matches the character in the `matchingString`. * * @group String Type + * @see [[org.apache.spark.sql.functions.translate]] */ def translate( matchingString: StringColumn, @@ -397,6 +421,7 @@ private[syntax] trait StringColumns { * Trim the spaces from both ends for the specified string column. * * @group String Type + * @see [[org.apache.spark.sql.functions.trim]] */ def trim: StringColumn = s.elem.map(f.trim).toDC @@ -405,6 +430,7 @@ private[syntax] trait StringColumns { * column (literal). 
* * @group String Type + * @see [[org.apache.spark.sql.functions.trim]] */ def trim(trimString: StringColumn): StringColumn = (s.elem, trimString.elem) @@ -417,6 +443,7 @@ private[syntax] trait StringColumns { * Converts a string column to upper case. * * @group String Type + * @see [[org.apache.spark.sql.functions.upper]] */ def upper: StringColumn = s.elem.map(f.upper).toDC @@ -431,6 +458,7 @@ private[syntax] trait StringColumns { * match. * * @group String Type + * @see [[org.apache.spark.sql.Column.contains]] */ def contains(dc: StringColumn): BooleanColumn = (s.elem, dc.elem).mapN(_.contains(_)).toDC @@ -439,6 +467,7 @@ private[syntax] trait StringColumns { * String ends with. Returns a boolean column based on a string match. * * @group String Type + * @see [[org.apache.spark.sql.Column.endsWith]] */ def endsWith(dc: StringColumn): BooleanColumn = (s.elem, dc.elem).mapN(_.endsWith(_)).toDC @@ -447,6 +476,7 @@ private[syntax] trait StringColumns { * SQL like expression. Returns a boolean column based on a SQL LIKE match. * * @group String Type + * @see [[org.apache.spark.sql.Column.like]] */ def like(literal: StringColumn): BooleanColumn = (s.elem, literal.elem) @@ -458,6 +488,7 @@ private[syntax] trait StringColumns { * on a regex match. * * @group String Type + * @see [[org.apache.spark.sql.Column.rlike]] */ def rLike(literal: StringColumn): BooleanColumn = (s.elem, literal.elem) @@ -468,6 +499,7 @@ private[syntax] trait StringColumns { * String starts with. Returns a boolean column based on a string match. * * @group String Type + * @see [[org.apache.spark.sql.Column.startsWith]] */ def startsWith(dc: StringColumn): BooleanColumn = (s.elem, dc.elem).mapN(_.startsWith(_)).toDC @@ -479,6 +511,7 @@ private[syntax] trait StringColumns { * on a regex match. * * @group String Type + * @see [[org.apache.spark.sql.Column.rlike]] */ def matchRegex(literal: StringColumn): BooleanColumn = rLike(literal) @@ -488,6 +521,7 @@ private[syntax] trait StringColumns { * If either argument is null, the result will also be null. * * @group String Type + * @see [[org.apache.spark.sql.functions.encode]] */ def encode(charset: StringColumn): BinaryColumn = (s.elem, charset.elem) @@ -501,6 +535,7 @@ private[syntax] trait StringColumns { * This is the reverse of base64. 
* * @group String Type + * @see [[org.apache.spark.sql.functions.unbase64]] */ def unbase64: BinaryColumn = s.elem.map(f.unbase64).toDC @@ -512,6 +547,7 @@ private[syntax] trait StringColumns { * A long * * @group String Type + * @see [[org.apache.spark.sql.functions.unix_timestamp]] */ def unixTimestamp: LongColumn = s.elem.map(f.unix_timestamp).toDC @@ -523,6 +559,7 @@ private[syntax] trait StringColumns { * @throws java.lang.IllegalArgumentException if invalid pattern * * @group String Type + * @see [[org.apache.spark.sql.functions.unix_timestamp]] */ def unixTimestamp(pattern: StringColumn): LongColumn = (s.elem, pattern.elem) @@ -541,6 +578,7 @@ private[syntax] trait StringColumns { * Similar to concat doric function, but only with two columns * * @group String Type + * @see [[org.apache.spark.sql.functions.concat]] */ def +(s2: StringColumn): StringColumn = concat(s, s2) @@ -557,6 +595,7 @@ private[syntax] trait StringColumns { * @return * A date, or null if `e` was a string that could not be cast to a date * or `format` was an invalid format + * @see [[org.apache.spark.sql.functions.to_date]] */ def toDate(format: StringColumn): LocalDateColumn = (s.elem, format.elem) @@ -578,6 +617,7 @@ private[syntax] trait StringColumns { * @return * A timestamp, or null if `s` was a string that could not be cast to a * timestamp or `format` was an invalid format + * @see [[org.apache.spark.sql.functions.to_timestamp]] */ def toTimestamp(format: StringColumn): InstantColumn = (s.elem, format.elem) @@ -597,6 +637,7 @@ private[syntax] trait StringColumns { * * @throws java.lang.RuntimeException with the error message * @group String Type + * @see [[org.apache.spark.sql.functions.raise_error]] */ def raiseError: NullColumn = s.elem.map(f.raise_error).toDC } diff --git a/core/src/main/scala/doric/syntax/TimestampColumns.scala b/core/src/main/scala/doric/syntax/TimestampColumns.scala index 0fc873b38..8e8da7ee9 100644 --- a/core/src/main/scala/doric/syntax/TimestampColumns.scala +++ b/core/src/main/scala/doric/syntax/TimestampColumns.scala @@ -17,6 +17,7 @@ private[syntax] trait TimestampColumns { * All calls of current_timestamp within the same query return the same value. * * @group Timestamp Type + * @see [[org.apache.spark.sql.functions.current_timestamp]] */ def currentTimestamp(): TimestampColumn = f.current_timestamp().asDoric[Timestamp] @@ -32,6 +33,7 @@ private[syntax] trait TimestampColumns { * * @throws java.time.DateTimeException if invalid timeZone * @group Timestamp Type + * @see [[org.apache.spark.sql.functions.from_utc_timestamp]] */ def fromUtc(timeZone: StringColumn): TimestampColumn = (column.elem, timeZone.elem) @@ -47,6 +49,7 @@ private[syntax] trait TimestampColumns { * * @throws java.time.DateTimeException if invalid timeZone * @group Timestamp Type + * @see [[org.apache.spark.sql.functions.to_utc_timestamp]] */ def toUtc(timeZone: StringColumn): TimestampColumn = (column.elem, timeZone.elem) @@ -59,6 +62,7 @@ private[syntax] trait TimestampColumns { * Extracts the seconds as an integer from a given timestamp. * * @group Timestamp Type + * @see [[org.apache.spark.sql.functions.second]] */ def second: IntegerColumn = column.elem.map(f.second).toDC @@ -70,6 +74,7 @@ private[syntax] trait TimestampColumns { * `1 second`. Check `org.apache.spark.unsafe.types.CalendarInterval` for * valid duration identifiers. 
* @group Timestamp Type + * @see [[org.apache.spark.sql.functions.window]] */ def window(windowDuration: String): RowColumn = column.elem.map(x => f.window(x, windowDuration)).toDC @@ -97,6 +102,7 @@ private[syntax] trait TimestampColumns { * start 15 minutes past the hour, e.g. 12:15-13:15, 13:15-14:15... provide * `startTime` as `15 minutes`. * @group Timestamp Type + * @see [[org.apache.spark.sql.functions.window]] */ def window( windowDuration: String, @@ -113,6 +119,7 @@ private[syntax] trait TimestampColumns { * @group Timestamp Type * @return * a Date Column without the hour + * @see [[org.apache.spark.sql.functions.to_date]] */ def toDate: DateColumn = column.elem.map(f.to_date).toDC @@ -122,6 +129,7 @@ private[syntax] trait TimestampColumns { * @group Timestamp Type * @return * a LocalDate Column without the hour + * @see [[org.apache.spark.sql.functions.to_date]] */ def toLocalDate: LocalDateColumn = column.elem.map(f.to_date).toDC } diff --git a/core/src/test/scala/doric/syntax/StringColumnsSpec.scala b/core/src/test/scala/doric/syntax/StringColumnsSpec.scala index 5ad0efaff..88c9966de 100644 --- a/core/src/test/scala/doric/syntax/StringColumnsSpec.scala +++ b/core/src/test/scala/doric/syntax/StringColumnsSpec.scala @@ -129,7 +129,7 @@ class StringColumnsSpec .toDF("col1") df.testColumns("col1")( - c => colString(c).initcap, + c => colString(c).initCap, c => f.initcap(f.col(c)), List("Hello World", "Ñañeñiño", "Tú Vas A Ir A Álaba", "1", null) .map(Option(_)) @@ -145,7 +145,7 @@ class StringColumnsSpec .toDF("col1") df.testColumns2("col1", "a")( - (c, str) => colString(c).instr(str.lit), + (c, str) => colString(c).inStr(str.lit), (c, str) => f.instr(f.col(c), str), List(Some(0), Some(2), Some(5), Some(0), None) ) From 7d59780554577916cb7618962d84197f3adc0de8 Mon Sep 17 00:00:00 2001 From: Eduardo Ruiz Date: Mon, 10 Jan 2022 17:01:27 +0100 Subject: [PATCH 2/2] doc: links for some functions Some functions scaladoc issue --> #135 --- .../scala/doric/syntax/BooleanColumns.scala | 4 +- .../main/scala/doric/syntax/DateColumns.scala | 16 +++---- .../scala/doric/syntax/StringColumns.scala | 48 ++++++++----------- .../scala/doric/syntax/TimestampColumns.scala | 14 +++--- 4 files changed, 37 insertions(+), 45 deletions(-) diff --git a/core/src/main/scala/doric/syntax/BooleanColumns.scala b/core/src/main/scala/doric/syntax/BooleanColumns.scala index 844a8af09..e9c683661 100644 --- a/core/src/main/scala/doric/syntax/BooleanColumns.scala +++ b/core/src/main/scala/doric/syntax/BooleanColumns.scala @@ -67,7 +67,7 @@ private[syntax] trait BooleanColumns { * * @throws java.lang.RuntimeException if the condition is false * @group Boolean Type - * @see [[org.apache.spark.sql.functions.assert_true]] + * @see [[org.apache.spark.sql.functions.assert_true(c:org\.apache\.spark\.sql\.Column):* org.apache.spark.sql.functions.assert_true]] */ def assertTrue: NullColumn = column.elem.map(f.assert_true).toDC @@ -76,7 +76,7 @@ private[syntax] trait BooleanColumns { * * @throws java.lang.RuntimeException if the condition is false * @group Boolean Type - * @see [[org.apache.spark.sql.functions.assert_true]] + * @see [[org.apache.spark.sql.functions.assert_true(c:org\.apache\.spark\.sql\.Column,e:* org.apache.spark.sql.functions.assert_true]] */ def assertTrue(msg: StringColumn): NullColumn = (column.elem, msg.elem).mapN(f.assert_true).toDC diff --git a/core/src/main/scala/doric/syntax/DateColumns.scala b/core/src/main/scala/doric/syntax/DateColumns.scala index aaf122e84..c30e43889 100644 --- 
a/core/src/main/scala/doric/syntax/DateColumns.scala +++ b/core/src/main/scala/doric/syntax/DateColumns.scala @@ -33,7 +33,7 @@ private[syntax] trait DateColumns { * Date column after adding months * @note * Timestamp columns will be truncated to Date column - * @see [[org.apache.spark.sql.functions.add_months]] + * @see [[org.apache.spark.sql.functions.add_months(startDate:org\.apache\.spark\.sql\.Column,numMonths:org\.apache\.spark\.sql\.Column):* org.apache.spark.sql.functions.add_months]] */ def addMonths(nMonths: IntegerColumn): DateColumn = (column.elem, nMonths.elem).mapN(f.add_months).toDC @@ -46,7 +46,7 @@ private[syntax] trait DateColumns { * @note * Timestamp columns will be truncated to Date column * @group Date & Timestamp Type - * @see [[org.apache.spark.sql.functions.date_add]] + * @see [[org.apache.spark.sql.functions.date_add(start:org\.apache\.spark\.sql\.Column,days:org\.apache\.spark\.sql\.Column):* org.apache.spark.sql.functions.date_add]] */ def addDays(days: IntegerColumn): DateColumn = (column.elem, days.elem).mapN(f.date_add).toDC @@ -79,7 +79,7 @@ private[syntax] trait DateColumns { * @note * Timestamp columns will be truncated to Date column * @group Date & Timestamp Type - * @see [[org.apache.spark.sql.functions.date_sub]] + * @see [[org.apache.spark.sql.functions.date_sub(start:org\.apache\.spark\.sql\.Column,days:org\.apache\.spark\.sql\.Column):* org.apache.spark.sql.functions.date_sub]] */ def subDays(days: IntegerColumn): DateColumn = (column.elem, days.elem).mapN(f.date_sub).toDC @@ -164,7 +164,7 @@ private[syntax] trait DateColumns { * @param dateCol * Date or Timestamp column * @group Date & Timestamp Type - * @see [[org.apache.spark.sql.functions.months_between]] + * @see [[org.apache.spark.sql.functions.months_between(end:org\.apache\.spark\.sql\.Column,start:org\.apache\.spark\.sql\.Column):* org.apache.spark.sql.functions.months_between]] */ def monthsBetween(dateCol: DoricColumn[T]): DoubleColumn = (column.elem, dateCol.elem).mapN(f.months_between).toDC @@ -178,7 +178,7 @@ private[syntax] trait DateColumns { * If `roundOff` is set to true, the result is rounded off to 8 digits; * it is not rounded otherwise. 
* @group Date & Timestamp Type - * @see [[org.apache.spark.sql.functions.months_between]] + * @see [[org.apache.spark.sql.functions.months_between(end:org\.apache\.spark\.sql\.Column,start:org\.apache\.spark\.sql\.Column,roundOff:* org.apache.spark.sql.functions.months_between]] */ def monthsBetween( dateCol: DoricColumn[T], @@ -260,7 +260,7 @@ private[syntax] trait DateColumns { * A long * * @group Date & Timestamp Type - * @see [[org.apache.spark.sql.functions.unix_timestamp]] + * @see [[org.apache.spark.sql.functions.unix_timestamp(s:org\.apache\.spark\.sql\.Column):* org.apache.spark.sql.functions.unix_timestamp]] */ def unixTimestamp: LongColumn = column.elem.map(f.unix_timestamp).toDC @@ -287,7 +287,7 @@ private[syntax] trait DateColumns { * Transform date to timestamp * * @group Date Type - * @see [[org.apache.spark.sql.functions.to_timestamp]] + * @see [[org.apache.spark.sql.functions.to_timestamp(s:org\.apache\.spark\.sql\.Column):* org.apache.spark.sql.functions.to_timestamp]] */ def toTimestamp: TimestampColumn = column.elem.map(f.to_timestamp).toDC @@ -295,7 +295,7 @@ private[syntax] trait DateColumns { * Transform date to Instant * * @group Date Type - * @see [[org.apache.spark.sql.functions.to_timestamp]] + * @see [[org.apache.spark.sql.functions.to_timestamp(s:org\.apache\.spark\.sql\.Column):* org.apache.spark.sql.functions.to_timestamp]] */ def toInstant: InstantColumn = column.elem.map(f.to_timestamp).toDC } diff --git a/core/src/main/scala/doric/syntax/StringColumns.scala b/core/src/main/scala/doric/syntax/StringColumns.scala index aea4b917d..c29a472ee 100644 --- a/core/src/main/scala/doric/syntax/StringColumns.scala +++ b/core/src/main/scala/doric/syntax/StringColumns.scala @@ -170,7 +170,8 @@ private[syntax] trait StringColumns { * @note * The position is not zero based, but 1 based index. returns 0 if substr * could not be found in str. - * @see [[org.apache.spark.sql.functions.locate]] + * @see org.apache.spark.sql.functions.locate + * @todo scaladoc link (issue #135) */ def locate( substr: StringColumn, @@ -209,7 +210,7 @@ private[syntax] trait StringColumns { * Trim the spaces from left end for the specified string value. * * @group String Type - * @see [[org.apache.spark.sql.functions.ltrim]] + * @see [[org.apache.spark.sql.functions.ltrim(e:org\.apache\.spark\.sql\.Column):* org.apache.spark.sql.functions.ltrim]] */ def ltrim: StringColumn = s.elem.map(f.ltrim).toDC @@ -218,7 +219,7 @@ private[syntax] trait StringColumns { * string column. * * @group String Type - * @see [[org.apache.spark.sql.functions.ltrim]] + * @see [[org.apache.spark.sql.functions.ltrim(e:org\.apache\.spark\.sql\.Column,trimString:* org.apache.spark.sql.functions.ltrim]] */ def ltrim(trimString: StringColumn): StringColumn = (s.elem, trimString.elem) @@ -232,7 +233,7 @@ private[syntax] trait StringColumns { * byte position `pos` of `src` and proceeding for `len` bytes. * * @group String Type - * @see [[org.apache.spark.sql.functions.overlay]] + * @see [[org.apache.spark.sql.functions.overlay(src:org\.apache\.spark\.sql\.Column,replace:org\.apache\.spark\.sql\.Column,pos:org\.apache\.spark\.sql\.Column,len:org\.apache\.spark\.sql\.Column):* org.apache.spark.sql.functions.overlay]] */ def overlay( replace: StringColumn, @@ -267,7 +268,7 @@ private[syntax] trait StringColumns { * with replacement. 
* * @group String Type - * @see [[org.apache.spark.sql.functions.regexp_replace]] + * @see [[org.apache.spark.sql.functions.regexp_replace(e:org\.apache\.spark\.sql\.Column,pattern:org\.apache\.spark\.sql\.Column,* org.apache.spark.sql.functions.regexp_replace]] */ def regexpReplace( pattern: StringColumn, @@ -302,7 +303,7 @@ private[syntax] trait StringColumns { * Trim the spaces from right end for the specified string value. * * @group String Type - * @see [[org.apache.spark.sql.functions.rtrim]] + * @see [[org.apache.spark.sql.functions.rtrim(e:org\.apache\.spark\.sql\.Column):* org.apache.spark.sql.functions.rtrim]] */ def rtrim: StringColumn = s.elem.map(f.rtrim).toDC @@ -311,7 +312,7 @@ private[syntax] trait StringColumns { * string column. * * @group String Type - * @see [[org.apache.spark.sql.functions.rtrim]] + * @see [[org.apache.spark.sql.functions.rtrim(e:org\.apache\.spark\.sql\.Column,trimString:* org.apache.spark.sql.functions.rtrim]] */ def rtrim(trimString: StringColumn): StringColumn = (s.elem, trimString.elem) @@ -326,19 +327,6 @@ private[syntax] trait StringColumns { */ def soundex: StringColumn = s.elem.map(f.soundex).toDC - /** - * Splits str around matches of the given pattern. - * - * @param pattern - * a string representing a regular expression. The regex string should be - * a Java regular expression. - * - * @group String Type - * @see [[org.apache.spark.sql.functions.split]] - */ - def split(pattern: StringColumn): ArrayColumn[String] = - split(pattern, (-1).lit) - /** * Splits str around matches of the given pattern. * @@ -353,10 +341,12 @@ private[syntax] trait StringColumns { * contain all input beyond the last matched regex. * - __limit less than or equal to 0__: `regex` will be applied as many times as possible, * and the resulting array can be of any size. + * @see org.apache.spark.sql.functions.split + * @todo scaladoc link (issue #135) */ def split( pattern: StringColumn, - limit: IntegerColumn + limit: IntegerColumn = (-1).lit ): ArrayColumn[String] = (s.elem, pattern.elem, limit.elem) .mapN((str, p, l) => new Column(StringSplit(str.expr, p.expr, l.expr))) @@ -421,7 +411,7 @@ private[syntax] trait StringColumns { * Trim the spaces from both ends for the specified string column. * * @group String Type - * @see [[org.apache.spark.sql.functions.trim]] + * @see [[org.apache.spark.sql.functions.trim(e:org\.apache\.spark\.sql\.Column):* org.apache.spark.sql.functions.trim]] */ def trim: StringColumn = s.elem.map(f.trim).toDC @@ -430,7 +420,7 @@ private[syntax] trait StringColumns { * column (literal). * * @group String Type - * @see [[org.apache.spark.sql.functions.trim]] + * @see [[org.apache.spark.sql.functions.trim(e:org\.apache\.spark\.sql\.Column,trimString:* org.apache.spark.sql.functions.trim]] */ def trim(trimString: StringColumn): StringColumn = (s.elem, trimString.elem) @@ -467,7 +457,7 @@ private[syntax] trait StringColumns { * String ends with. Returns a boolean column based on a string match. * * @group String Type - * @see [[org.apache.spark.sql.Column.endsWith]] + * @see [[org.apache.spark.sql.Column.endsWith(other:* org.apache.spark.sql.Column.endsWith]] */ def endsWith(dc: StringColumn): BooleanColumn = (s.elem, dc.elem).mapN(_.endsWith(_)).toDC @@ -499,7 +489,7 @@ private[syntax] trait StringColumns { * String starts with. Returns a boolean column based on a string match. 
* * @group String Type - * @see [[org.apache.spark.sql.Column.startsWith]] + * @see [[org.apache.spark.sql.Column.startsWith(other:* org.apache.spark.sql.Column.startsWith]] */ def startsWith(dc: StringColumn): BooleanColumn = (s.elem, dc.elem).mapN(_.startsWith(_)).toDC @@ -547,7 +537,7 @@ private[syntax] trait StringColumns { * A long * * @group String Type - * @see [[org.apache.spark.sql.functions.unix_timestamp]] + * @see [[org.apache.spark.sql.functions.unix_timestamp(s:org\.apache\.spark\.sql\.Column):* org.apache.spark.sql.functions.unix_timestamp]] */ def unixTimestamp: LongColumn = s.elem.map(f.unix_timestamp).toDC @@ -559,7 +549,7 @@ private[syntax] trait StringColumns { * @throws java.lang.IllegalArgumentException if invalid pattern * * @group String Type - * @see [[org.apache.spark.sql.functions.unix_timestamp]] + * @see [[org.apache.spark.sql.functions.unix_timestamp(s:org\.apache\.spark\.sql\.Column,p:* org.apache.spark.sql.functions.unix_timestamp]] */ def unixTimestamp(pattern: StringColumn): LongColumn = (s.elem, pattern.elem) @@ -595,7 +585,7 @@ private[syntax] trait StringColumns { * @return * A date, or null if `e` was a string that could not be cast to a date * or `format` was an invalid format - * @see [[org.apache.spark.sql.functions.to_date]] + * @see [[org.apache.spark.sql.functions.to_date(e:org\.apache\.spark\.sql\.Column,fmt:* org.apache.spark.sql.functions.to_date]] */ def toDate(format: StringColumn): LocalDateColumn = (s.elem, format.elem) @@ -617,7 +607,7 @@ private[syntax] trait StringColumns { * @return * A timestamp, or null if `s` was a string that could not be cast to a * timestamp or `format` was an invalid format - * @see [[org.apache.spark.sql.functions.to_timestamp]] + * @see [[org.apache.spark.sql.functions.to_timestamp(s:org\.apache\.spark\.sql\.Column,fmt:* org.apache.spark.sql.functions.to_timestamp]] */ def toTimestamp(format: StringColumn): InstantColumn = (s.elem, format.elem) diff --git a/core/src/main/scala/doric/syntax/TimestampColumns.scala b/core/src/main/scala/doric/syntax/TimestampColumns.scala index 8e8da7ee9..414e86b02 100644 --- a/core/src/main/scala/doric/syntax/TimestampColumns.scala +++ b/core/src/main/scala/doric/syntax/TimestampColumns.scala @@ -33,7 +33,8 @@ private[syntax] trait TimestampColumns { * * @throws java.time.DateTimeException if invalid timeZone * @group Timestamp Type - * @see [[org.apache.spark.sql.functions.from_utc_timestamp]] + * @see org.apache.spark.sql.functions.from_utc_timestamp + * @todo scaladoc link (issue #135) */ def fromUtc(timeZone: StringColumn): TimestampColumn = (column.elem, timeZone.elem) @@ -49,7 +50,8 @@ private[syntax] trait TimestampColumns { * * @throws java.time.DateTimeException if invalid timeZone * @group Timestamp Type - * @see [[org.apache.spark.sql.functions.to_utc_timestamp]] + * @see org.apache.spark.sql.functions.to_utc_timestamp + * @todo scaladoc link (issue #135) */ def toUtc(timeZone: StringColumn): TimestampColumn = (column.elem, timeZone.elem) @@ -74,7 +76,7 @@ private[syntax] trait TimestampColumns { * `1 second`. Check `org.apache.spark.unsafe.types.CalendarInterval` for * valid duration identifiers. 
* @group Timestamp Type - * @see [[org.apache.spark.sql.functions.window]] + * @see [[org.apache.spark.sql.functions.window(timeColumn:org\.apache\.spark\.sql\.Column,windowDuration:String):* org.apache.spark.sql.functions.window]] */ def window(windowDuration: String): RowColumn = column.elem.map(x => f.window(x, windowDuration)).toDC @@ -102,7 +104,7 @@ private[syntax] trait TimestampColumns { * start 15 minutes past the hour, e.g. 12:15-13:15, 13:15-14:15... provide * `startTime` as `15 minutes`. * @group Timestamp Type - * @see [[org.apache.spark.sql.functions.window]] + * @see [[org.apache.spark.sql.functions.window(timeColumn:org\.apache\.spark\.sql\.Column,windowDuration:String,slideDuration:String,startTime:* org.apache.spark.sql.functions.window]] */ def window( windowDuration: String, @@ -119,7 +121,7 @@ private[syntax] trait TimestampColumns { * @group Timestamp Type * @return * a Date Column without the hour - * @see [[org.apache.spark.sql.functions.to_date]] + * @see [[org.apache.spark.sql.functions.to_date(e:org\.apache\.spark\.sql\.Column):* org.apache.spark.sql.functions.to_date]] */ def toDate: DateColumn = column.elem.map(f.to_date).toDC @@ -129,7 +131,7 @@ private[syntax] trait TimestampColumns { * @group Timestamp Type * @return * a LocalDate Column without the hour - * @see [[org.apache.spark.sql.functions.to_date]] + * @see [[org.apache.spark.sql.functions.to_date(e:org\.apache\.spark\.sql\.Column):* org.apache.spark.sql.functions.to_date]] */ def toLocalDate: LocalDateColumn = column.elem.map(f.to_date).toDC }