Source code for pyspark.sql.column

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# mypy: disable-error-code="empty-body"

import sys
from typing import (
    overload,
    Any,
    TYPE_CHECKING,
    Union,
)

from pyspark.sql.utils import dispatch_col_method
from pyspark.sql.types import DataType
from pyspark.errors import PySparkValueError

if TYPE_CHECKING:
    from py4j.java_gateway import JavaObject
    from pyspark.sql._typing import LiteralType, DecimalLiteral, DateTimeLiteral
    from pyspark.sql.window import WindowSpec

__all__ = ["Column"]


[docs]class Column: """ A column in a DataFrame. .. versionadded:: 1.3.0 .. versionchanged:: 3.4.0 Supports Spark Connect. Examples -------- Column instances can be created by >>> df = spark.createDataFrame( ... [(2, "Alice"), (5, "Bob")], ["age", "name"]) Select a column out of a DataFrame >>> df.name Column<'name'> >>> df["name"] Column<'name'> Create from an expression >>> df.age + 1 Column<...> >>> 1 / df.age Column<...> """ # HACK ALERT!! this is to reduce the backward compatibility concern, and returns # Spark Classic Column by default. This is NOT an API, and NOT supposed to # be directly invoked. DO NOT use this constructor. def __new__( cls, jc: "JavaObject", ) -> "Column": from pyspark.sql.classic.column import Column return Column.__new__(Column, jc) def __init__(self, jc: "JavaObject") -> None: self._jc = jc # arithmetic operators @dispatch_col_method def __neg__(self) -> "Column": ... @dispatch_col_method def __add__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ... @dispatch_col_method def __sub__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ... @dispatch_col_method def __mul__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ... @dispatch_col_method def __div__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ... @dispatch_col_method def __truediv__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ... @dispatch_col_method def __mod__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ... @dispatch_col_method def __radd__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ... @dispatch_col_method def __rsub__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ... @dispatch_col_method def __rmul__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ... @dispatch_col_method def __rdiv__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ... @dispatch_col_method def __rtruediv__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ... @dispatch_col_method def __rmod__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ... @dispatch_col_method def __pow__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ... @dispatch_col_method def __rpow__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ... # logistic operators @dispatch_col_method def __eq__( # type: ignore[override] self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"], ) -> "Column": """binary function""" ... @dispatch_col_method def __ne__( # type: ignore[override] self, other: Any, ) -> "Column": """binary function""" ... @dispatch_col_method def __lt__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ... @dispatch_col_method def __le__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ... @dispatch_col_method def __ge__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ... 
@dispatch_col_method def __gt__( self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"] ) -> "Column": ...
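    # NOTE (editor's sketch, not part of the original source): all of the arithmetic
    # and comparison operators above build lazy Column expressions rather than
    # evaluating anything eagerly, e.g.
    #
    #   >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
    #   >>> df.select((df.age + 1).alias("next_age")).collect()
    #   [Row(next_age=3), Row(next_age=6)]
    #   >>> df.filter(df.age > 3).collect()
    #   [Row(age=5, name='Bob')]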
    @dispatch_col_method
    def eqNullSafe(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        Equality test that is safe for null values.

        .. versionadded:: 2.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other
            a value or :class:`Column`

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df1 = spark.createDataFrame([
        ...     Row(id=1, value='foo'),
        ...     Row(id=2, value=None)
        ... ])
        >>> df1.select(
        ...     df1['value'] == 'foo',
        ...     df1['value'].eqNullSafe('foo'),
        ...     df1['value'].eqNullSafe(None)
        ... ).show()
        +-------------+---------------+----------------+
        |(value = foo)|(value <=> foo)|(value <=> NULL)|
        +-------------+---------------+----------------+
        |         true|           true|           false|
        |         NULL|          false|            true|
        +-------------+---------------+----------------+
        >>> df2 = spark.createDataFrame([
        ...     Row(value = 'bar'),
        ...     Row(value = None)
        ... ])
        >>> df1.join(df2, df1["value"] == df2["value"]).count()
        0
        >>> df1.join(df2, df1["value"].eqNullSafe(df2["value"])).count()
        1
        >>> df2 = spark.createDataFrame([
        ...     Row(id=1, value=float('NaN')),
        ...     Row(id=2, value=42.0),
        ...     Row(id=3, value=None)
        ... ])
        >>> df2.select(
        ...     df2['value'].eqNullSafe(None),
        ...     df2['value'].eqNullSafe(float('NaN')),
        ...     df2['value'].eqNullSafe(42.0)
        ... ).show()
        +----------------+---------------+----------------+
        |(value <=> NULL)|(value <=> NaN)|(value <=> 42.0)|
        +----------------+---------------+----------------+
        |           false|           true|           false|
        |           false|          false|            true|
        |            true|          false|           false|
        +----------------+---------------+----------------+

        Notes
        -----
        Unlike Pandas, PySpark doesn't consider NaN values to be NULL. See the
        `NaN Semantics <https://spark.apache.org/docs/latest/sql-ref-datatypes.html#nan-semantics>`_
        for details.
        """
        ...
    # `and`, `or`, `not` cannot be overloaded in Python,
    # so use bitwise operators as boolean operators
    @dispatch_col_method
    def __and__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __or__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __invert__(self) -> "Column":
        ...

    @dispatch_col_method
    def __rand__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __ror__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    # container operators
    @dispatch_col_method
    def __contains__(self, item: Any) -> None:
        raise PySparkValueError(
            errorClass="CANNOT_APPLY_IN_FOR_COLUMN",
            messageParameters={},
        )
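    # NOTE (editor's sketch, not part of the original source): because `&`, `|` and `~`
    # stand in for `and`, `or` and `not`, and bind more tightly than the comparison
    # operators, each comparison has to be parenthesized when they are combined, e.g.
    #
    #   >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
    #   >>> df.filter((df.age > 3) & (df.name != "Alice")).collect()
    #   [Row(age=5, name='Bob')]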
    # bitwise operators
    @dispatch_col_method
    def bitwiseOR(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        Compute bitwise OR of this expression with another expression.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other
            a value or :class:`Column` to calculate bitwise or(|) with
            this :class:`Column`.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(a=170, b=75)])
        >>> df.select(df.a.bitwiseOR(df.b)).collect()
        [Row((a | b)=235)]
        """
        ...
    @dispatch_col_method
    def bitwiseAND(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        Compute bitwise AND of this expression with another expression.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other
            a value or :class:`Column` to calculate bitwise and(&) with
            this :class:`Column`.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(a=170, b=75)])
        >>> df.select(df.a.bitwiseAND(df.b)).collect()
        [Row((a & b)=10)]
        """
        ...

    @dispatch_col_method
    def bitwiseXOR(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        Compute bitwise XOR of this expression with another expression.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other
            a value or :class:`Column` to calculate bitwise xor(^) with
            this :class:`Column`.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(a=170, b=75)])
        >>> df.select(df.a.bitwiseXOR(df.b)).collect()
        [Row((a ^ b)=225)]
        """
        ...
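    # NOTE (editor's check, not part of the original source): the doctest results above
    # can be verified with plain Python integers, since a=170 is 0b10101010 and
    # b=75 is 0b01001011:
    #
    #   >>> 170 | 75, 170 & 75, 170 ^ 75
    #   (235, 10, 225)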
    @dispatch_col_method
    def getItem(self, key: Any) -> "Column":
        """
        An expression that gets an item at position ``ordinal`` out of a list,
        or gets an item by key out of a dict.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        key
            a literal value, or a :class:`Column` expression.
            The position in the list, or the key in the dict, to look up.

            .. deprecated:: 3.0.0
                :class:`Column` as a parameter is deprecated.

        Returns
        -------
        :class:`Column`
            Column representing the item(s) at the given position in a list
            or for the given key in a dict.

        Examples
        --------
        >>> df = spark.createDataFrame([([1, 2], {"key": "value"})], ["l", "d"])
        >>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
        +----+------+
        |l[0]|d[key]|
        +----+------+
        |   1| value|
        +----+------+
        """
        ...
    @dispatch_col_method
    def getField(self, name: Any) -> "Column":
        """
        An expression that gets a field by name in a :class:`StructType`.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        name
            a literal value, or a :class:`Column` expression.
            The name of the field to extract from the struct.

            .. deprecated:: 3.0.0
                :class:`Column` as a parameter is deprecated.

        Returns
        -------
        :class:`Column`
            Column representing the field got by name.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(r=Row(a=1, b="b"))])
        >>> df.select(df.r.getField("b")).show()
        +---+
        |r.b|
        +---+
        |  b|
        +---+
        >>> df.select(df.r.a).show()
        +---+
        |r.a|
        +---+
        |  1|
        +---+
        """
        ...
    @dispatch_col_method
    def withField(self, fieldName: str, col: "Column") -> "Column":
        """
        An expression that adds/replaces a field in :class:`StructType` by name.

        .. versionadded:: 3.1.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        fieldName : str
            a literal value.
            The name of the field to add or replace in the struct.
        col : :class:`Column`
            A :class:`Column` expression for the value stored under `fieldName`.

        Returns
        -------
        :class:`Column`
            Column representing the struct with the field added or replaced by `fieldName`.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> from pyspark.sql.functions import lit
        >>> df = spark.createDataFrame([Row(a=Row(b=1, c=2))])
        >>> df.withColumn('a', df['a'].withField('b', lit(3))).select('a.b').show()
        +---+
        |  b|
        +---+
        |  3|
        +---+
        >>> df.withColumn('a', df['a'].withField('d', lit(4))).select('a.d').show()
        +---+
        |  d|
        +---+
        |  4|
        +---+
        """
        ...
    @dispatch_col_method
    def dropFields(self, *fieldNames: str) -> "Column":
        """
        An expression that drops fields in :class:`StructType` by name.
        This is a no-op if the schema doesn't contain field name(s).

        .. versionadded:: 3.1.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        fieldNames : str
            Desired field names (collects all positional arguments passed).
            Any field that matches is dropped from the struct.

        Returns
        -------
        :class:`Column`
            Column representing the struct with the given fields dropped.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> from pyspark.sql.functions import col, lit
        >>> df = spark.createDataFrame([
        ...     Row(a=Row(b=1, c=2, d=3, e=Row(f=4, g=5, h=6)))])
        >>> df.withColumn('a', df['a'].dropFields('b')).show()
        +-----------------+
        |                a|
        +-----------------+
        |{2, 3, {4, 5, 6}}|
        +-----------------+

        >>> df.withColumn('a', df['a'].dropFields('b', 'c')).show()
        +--------------+
        |             a|
        +--------------+
        |{3, {4, 5, 6}}|
        +--------------+

        This method supports dropping multiple nested fields directly e.g.

        >>> df.withColumn("a", col("a").dropFields("e.g", "e.h")).show()
        +--------------+
        |             a|
        +--------------+
        |{1, 2, 3, {4}}|
        +--------------+

        However, if you are going to add/replace multiple nested fields,
        it is preferred to extract out the nested struct before
        adding/replacing multiple fields e.g.

        >>> df.select(col("a").withField(
        ...     "e", col("a.e").dropFields("g", "h")).alias("a")
        ... ).show()
        +--------------+
        |             a|
        +--------------+
        |{1, 2, 3, {4}}|
        +--------------+
        """
        ...
    @dispatch_col_method
    def __getattr__(self, item: Any) -> "Column":
        """
        An expression that gets an item at position ``ordinal`` out of a list,
        or gets an item by key out of a dict.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        item
            a literal value.

        Returns
        -------
        :class:`Column`
            Column representing the item got by key out of a dict.

        Examples
        --------
        >>> df = spark.createDataFrame([('abcedfg', {"key": "value"})], ["l", "d"])
        >>> df.select(df.d.key).show()
        +------+
        |d[key]|
        +------+
        | value|
        +------+
        """
        ...

    @dispatch_col_method
    def __getitem__(self, k: Any) -> "Column":
        """
        An expression that gets an item at position ``ordinal`` out of a list,
        or gets an item by key out of a dict.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        k
            a literal value, or a slice object without step.

        Returns
        -------
        :class:`Column`
            Column representing the item got by key out of a dict, or substrings sliced by
            the given slice object.

        Examples
        --------
        >>> df = spark.createDataFrame([('abcedfg', {"key": "value"})], ["l", "d"])
        >>> df.select(df.l[slice(1, 3)], df.d['key']).show()
        +---------------+------+
        |substr(l, 1, 3)|d[key]|
        +---------------+------+
        |            abc| value|
        +---------------+------+
        """
        ...
    @dispatch_col_method
    def __iter__(self) -> None:
        ...

    # string methods
    @dispatch_col_method
    def contains(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        Contains the other element. Returns a boolean :class:`Column` based on a string match.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other
            string in line. A value as a literal or a :class:`Column`.

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.contains('o')).collect()
        [Row(age=5, name='Bob')]
        """
        ...

    @dispatch_col_method
    def startswith(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        String starts with. Returns a boolean :class:`Column` based on a string match.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : :class:`Column` or str
            string at start of line (do not use a regex `^`)

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.startswith('Al')).collect()
        [Row(age=2, name='Alice')]
        >>> df.filter(df.name.startswith('^Al')).collect()
        []
        """
        ...

    @dispatch_col_method
    def endswith(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        String ends with. Returns a boolean :class:`Column` based on a string match.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : :class:`Column` or str
            string at end of line (do not use a regex `$`)

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.endswith('ice')).collect()
        [Row(age=2, name='Alice')]
        >>> df.filter(df.name.endswith('ice$')).collect()
        []
        """
        ...
    @dispatch_col_method
    def like(self: "Column", other: str) -> "Column":
        """
        SQL like expression. Returns a boolean :class:`Column` based on a SQL LIKE match.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : str
            a SQL LIKE pattern

        See Also
        --------
        pyspark.sql.Column.rlike

        Returns
        -------
        :class:`Column`
            Column of booleans showing whether each element
            in the Column is matched by SQL LIKE pattern.

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.like('Al%')).collect()
        [Row(age=2, name='Alice')]
        """
        ...

    @dispatch_col_method
    def rlike(self: "Column", other: str) -> "Column":
        """
        SQL RLIKE expression (LIKE with Regex). Returns a boolean :class:`Column`
        based on a regex match.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : str
            an extended regex expression

        Returns
        -------
        :class:`Column`
            Column of booleans showing whether each element
            in the Column is matched by extended regex expression.

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.rlike('ice$')).collect()
        [Row(age=2, name='Alice')]
        """
        ...

    @dispatch_col_method
    def ilike(self: "Column", other: str) -> "Column":
        """
        SQL ILIKE expression (case insensitive LIKE). Returns a boolean :class:`Column`
        based on a case insensitive match.

        .. versionadded:: 3.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : str
            a SQL LIKE pattern

        See Also
        --------
        pyspark.sql.Column.rlike

        Returns
        -------
        :class:`Column`
            Column of booleans showing whether each element
            in the Column is matched by SQL LIKE pattern.

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.ilike('%Ice')).collect()
        [Row(age=2, name='Alice')]
        """
        ...
    @overload
    def substr(self, startPos: int, length: int) -> "Column":
        ...

    @overload
    def substr(self, startPos: "Column", length: "Column") -> "Column":
        ...

    @dispatch_col_method
    def substr(self, startPos: Union[int, "Column"], length: Union[int, "Column"]) -> "Column":
        """
        Return a :class:`Column` which is a substring of the column.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        startPos : :class:`Column` or int
            start position
        length : :class:`Column` or int
            length of the substring

        Returns
        -------
        :class:`Column`
            Column of substrings of the original Column.

        Examples
        --------
        Example 1. Using integers for the input arguments.

        >>> df = spark.createDataFrame(
        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.select(df.name.substr(1, 3).alias("col")).collect()
        [Row(col='Ali'), Row(col='Bob')]

        Example 2. Using columns for the input arguments.

        >>> df = spark.createDataFrame(
        ...     [(3, 4, "Alice"), (2, 3, "Bob")], ["sidx", "eidx", "name"])
        >>> df.select(df.name.substr(df.sidx, df.eidx).alias("col")).collect()
        [Row(col='ice'), Row(col='ob')]
        """
        ...
    @dispatch_col_method
    def isin(self, *cols: Any) -> "Column":
        """
        A boolean expression that is evaluated to true if the value of this
        expression is contained by the evaluated values of the arguments.

        .. versionadded:: 1.5.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        cols : Any
            The values to compare with the column values. The result will only be true
            at a location if any value matches in the Column.

        Returns
        -------
        :class:`Column`
            Column of booleans showing whether each element in the Column is contained in cols.

        Examples
        --------
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob"), (8, "Mike")], ["age", "name"])

        Example 1: Filter rows with names in the specified values

        >>> df[df.name.isin("Bob", "Mike")].show()
        +---+----+
        |age|name|
        +---+----+
        |  5| Bob|
        |  8|Mike|
        +---+----+

        Example 2: Filter rows with ages in the specified list

        >>> df[df.age.isin([1, 2, 3])].show()
        +---+-----+
        |age| name|
        +---+-----+
        |  2|Alice|
        +---+-----+

        Example 3: Filter rows with names not in the specified values

        >>> df[~df.name.isin("Alice", "Bob")].show()
        +---+----+
        |age|name|
        +---+----+
        |  8|Mike|
        +---+----+
        """
        ...
    # order
    @dispatch_col_method
    def asc(self) -> "Column":
        """
        Returns a sort expression based on the ascending order of the column.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.asc()).collect()
        [Row(name='Alice'), Row(name='Tom')]
        """
        ...

    @dispatch_col_method
    def asc_nulls_first(self) -> "Column":
        """
        Returns a sort expression based on ascending order of the column, and null values
        return before non-null values.

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame(
        ...     [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.asc_nulls_first()).collect()
        [Row(name=None), Row(name='Alice'), Row(name='Tom')]
        """
        ...

    @dispatch_col_method
    def asc_nulls_last(self) -> "Column":
        """
        Returns a sort expression based on ascending order of the column, and null values
        appear after non-null values.

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame(
        ...     [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.asc_nulls_last()).collect()
        [Row(name='Alice'), Row(name='Tom'), Row(name=None)]
        """
        ...

    @dispatch_col_method
    def desc(self) -> "Column":
        """
        Returns a sort expression based on the descending order of the column.

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.desc()).collect()
        [Row(name='Tom'), Row(name='Alice')]
        """
        ...

    @dispatch_col_method
    def desc_nulls_first(self) -> "Column":
        """
        Returns a sort expression based on the descending order of the column, and null values
        appear before non-null values.

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame(
        ...     [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.desc_nulls_first()).collect()
        [Row(name=None), Row(name='Tom'), Row(name='Alice')]
        """
        ...

    @dispatch_col_method
    def desc_nulls_last(self) -> "Column":
        """
        Returns a sort expression based on the descending order of the column, and null values
        appear after non-null values.

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame(
        ...     [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.desc_nulls_last()).collect()
        [Row(name='Tom'), Row(name='Alice'), Row(name=None)]
        """
        ...
    @dispatch_col_method
    def isNull(self) -> "Column":
        """
        True if the current expression is null.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)])
        >>> df.filter(df.height.isNull()).collect()
        [Row(name='Alice', height=None)]
        """
        ...

    @dispatch_col_method
    def isNotNull(self) -> "Column":
        """
        True if the current expression is NOT null.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)])
        >>> df.filter(df.height.isNotNull()).collect()
        [Row(name='Tom', height=80)]
        """
        ...

    @dispatch_col_method
    def isNaN(self) -> "Column":
        """
        True if the current expression is NaN.

        .. versionadded:: 4.0.0

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame(
        ...     [Row(name='Tom', height=80.0), Row(name='Alice', height=float('nan'))])
        >>> df.filter(df.height.isNaN()).collect()
        [Row(name='Alice', height=nan)]
        """
        ...
    @dispatch_col_method
    def alias(self, *alias: str, **kwargs: Any) -> "Column":
        """
        Returns this column aliased with a new name or names (in the case of expressions that
        return more than one column, such as explode).

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        alias : str
            desired column names (collects all positional arguments passed)

        Other Parameters
        ----------------
        metadata: dict
            a dict of information to be stored in ``metadata`` attribute of the
            corresponding :class:`StructField <pyspark.sql.types.StructField>`
            (optional, keyword only argument)

            .. versionchanged:: 2.2.0
               Added optional ``metadata`` argument.

        Returns
        -------
        :class:`Column`
            Column representing this column aliased with the new name or names.

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.select(df.age.alias("age2")).collect()
        [Row(age2=2), Row(age2=5)]
        >>> df.select(df.age.alias("age3", metadata={'max': 99})).schema['age3'].metadata['max']
        99
        """
        ...

    @dispatch_col_method
    def name(self, *alias: str, **kwargs: Any) -> "Column":
        """
        :func:`name` is an alias for :func:`alias`.

        .. versionadded:: 2.0.0
        """
        ...
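    # NOTE (editor's sketch, not part of the original source): `name` can be used
    # interchangeably with `alias`, e.g.
    #
    #   >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
    #   >>> df.select(df.age.name("age2")).collect()
    #   [Row(age2=2), Row(age2=5)]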
    @dispatch_col_method
    def cast(self, dataType: Union[DataType, str]) -> "Column":
        """
        Casts the column into type ``dataType``.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        dataType : :class:`DataType` or str
            a :class:`DataType` or a DDL-formatted type string to cast the column to.

        Returns
        -------
        :class:`Column`
            Column cast into the new type.

        Examples
        --------
        >>> from pyspark.sql.types import StringType
        >>> df = spark.createDataFrame(
        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.select(df.age.cast("string").alias('ages')).collect()
        [Row(ages='2'), Row(ages='5')]
        >>> df.select(df.age.cast(StringType()).alias('ages')).collect()
        [Row(ages='2'), Row(ages='5')]
        """
        ...

    @dispatch_col_method
    def try_cast(self, dataType: Union[DataType, str]) -> "Column":
        """
        This is a special version of `cast` that performs the same operation, but returns a NULL
        value instead of raising an error when the cast fails.

        .. versionadded:: 4.0.0

        Parameters
        ----------
        dataType : :class:`DataType` or str
            a :class:`DataType` or a DDL-formatted type string to cast the column to.

        Returns
        -------
        :class:`Column`
            Column cast into the new type, or NULL where the cast fails.

        Examples
        --------
        Example 1: Cast with a Datatype

        >>> from pyspark.sql.types import LongType
        >>> df = spark.createDataFrame(
        ...     [(2, "123"), (5, "Bob"), (3, None)], ["age", "name"])
        >>> df.select(df.name.try_cast(LongType())).show()
        +----+
        |name|
        +----+
        | 123|
        |NULL|
        |NULL|
        +----+

        Example 2: Cast with a DDL string

        >>> df = spark.createDataFrame(
        ...     [(2, "123"), (5, "Bob"), (3, None)], ["age", "name"])
        >>> df.select(df.name.try_cast("double")).show()
        +-----+
        | name|
        +-----+
        |123.0|
        | NULL|
        | NULL|
        +-----+
        """
        ...

    @dispatch_col_method
    def astype(self, dataType: Union[DataType, str]) -> "Column":
        """
        :func:`astype` is an alias for :func:`cast`.

        .. versionadded:: 1.4.0
        """
        ...
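    # NOTE (editor's sketch, not part of the original source): `astype` mirrors the
    # pandas spelling and behaves exactly like `cast`, e.g.
    #
    #   >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
    #   >>> df.select(df.age.astype("string").alias("ages")).collect()
    #   [Row(ages='2'), Row(ages='5')]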
    @dispatch_col_method
    def between(
        self,
        lowerBound: Union["Column", "LiteralType", "DateTimeLiteral", "DecimalLiteral"],
        upperBound: Union["Column", "LiteralType", "DateTimeLiteral", "DecimalLiteral"],
    ) -> "Column":
        """
        Check if the current column's values are between the specified lower and upper
        bounds, inclusive.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        lowerBound : :class:`Column`, int, float, string, bool, datetime, date or Decimal
            The lower boundary value, inclusive.
        upperBound : :class:`Column`, int, float, string, bool, datetime, date or Decimal
            The upper boundary value, inclusive.

        Returns
        -------
        :class:`Column`
            A new column of boolean values indicating whether each element in the original
            column is within the specified range (inclusive).

        Examples
        --------
        Using between with integer values.

        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.select(df.name, df.age.between(2, 4)).show()
        +-----+---------------------------+
        | name|((age >= 2) AND (age <= 4))|
        +-----+---------------------------+
        |Alice|                       true|
        |  Bob|                      false|
        +-----+---------------------------+

        Using between with string values.

        >>> df = spark.createDataFrame([("Alice", "A"), ("Bob", "B")], ["name", "initial"])
        >>> df.select(df.name, df.initial.between("A", "B")).show()
        +-----+-----------------------------------+
        | name|((initial >= A) AND (initial <= B))|
        +-----+-----------------------------------+
        |Alice|                               true|
        |  Bob|                               true|
        +-----+-----------------------------------+

        Using between with float values.

        >>> df = spark.createDataFrame(
        ...     [(2.5, "Alice"), (5.5, "Bob")], ["height", "name"])
        >>> df.select(df.name, df.height.between(2.0, 5.0)).show()
        +-----+-------------------------------------+
        | name|((height >= 2.0) AND (height <= 5.0))|
        +-----+-------------------------------------+
        |Alice|                                 true|
        |  Bob|                                false|
        +-----+-------------------------------------+

        Using between with date values.

        >>> import pyspark.sql.functions as sf
        >>> df = spark.createDataFrame(
        ...     [("Alice", "2023-01-01"), ("Bob", "2023-02-01")], ["name", "date"])
        >>> df = df.withColumn("date", sf.to_date(df.date))
        >>> df.select(df.name, df.date.between("2023-01-01", "2023-01-15")).show()
        +-----+-----------------------------------------------+
        | name|((date >= 2023-01-01) AND (date <= 2023-01-15))|
        +-----+-----------------------------------------------+
        |Alice|                                           true|
        |  Bob|                                          false|
        +-----+-----------------------------------------------+
        >>> from datetime import date
        >>> df.select(df.name, df.date.between(date(2023, 1, 1), date(2023, 1, 15))).show()
        +-----+-------------------------------------------------------------+
        | name|((date >= DATE '2023-01-01') AND (date <= DATE '2023-01-15'))|
        +-----+-------------------------------------------------------------+
        |Alice|                                                          true|
        |  Bob|                                                         false|
        +-----+-------------------------------------------------------------+

        Using between with timestamp values.

        >>> import pyspark.sql.functions as sf
        >>> df = spark.createDataFrame(
        ...     [("Alice", "2023-01-01 10:00:00"), ("Bob", "2023-02-01 10:00:00")],
        ...     schema=["name", "timestamp"])
        >>> df = df.withColumn("timestamp", sf.to_timestamp(df.timestamp))
        >>> df.select(df.name, df.timestamp.between("2023-01-01", "2023-02-01")).show()
        +-----+---------------------------------------------------------+
        | name|((timestamp >= 2023-01-01) AND (timestamp <= 2023-02-01))|
        +-----+---------------------------------------------------------+
        |Alice|                                                     true|
        |  Bob|                                                    false|
        +-----+---------------------------------------------------------+
        >>> df.select(df.name, df.timestamp.between("2023-01-01", "2023-02-01 12:00:00")).show()
        +-----+------------------------------------------------------------------+
        | name|((timestamp >= 2023-01-01) AND (timestamp <= 2023-02-01 12:00:00))|
        +-----+------------------------------------------------------------------+
        |Alice|                                                              true|
        |  Bob|                                                              true|
        +-----+------------------------------------------------------------------+
        """
        ...
    @dispatch_col_method
    def when(self, condition: "Column", value: Any) -> "Column":
        """
        Evaluates a list of conditions and returns one of multiple possible result expressions.
        If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.

        .. versionadded:: 1.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        condition : :class:`Column`
            a boolean :class:`Column` expression.
        value
            a literal value, or a :class:`Column` expression.

        Returns
        -------
        :class:`Column`
            Column representing the value produced by the matching condition.

        Examples
        --------
        Example 1: Using :func:`when` with conditions and values to create a new Column

        >>> from pyspark.sql import functions as sf
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> result = df.select(df.name, sf.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0))
        >>> result.show()
        +-----+------------------------------------------------------------+
        | name|CASE WHEN (age > 4) THEN 1 WHEN (age < 3) THEN -1 ELSE 0 END|
        +-----+------------------------------------------------------------+
        |Alice|                                                          -1|
        |  Bob|                                                           1|
        +-----+------------------------------------------------------------+

        Example 2: Chaining multiple :func:`when` conditions

        >>> from pyspark.sql import functions as sf
        >>> df = spark.createDataFrame([(1, "Alice"), (4, "Bob"), (6, "Charlie")], ["age", "name"])
        >>> result = df.select(
        ...     df.name,
        ...     sf.when(df.age < 3, "Young").when(df.age < 5, "Middle-aged").otherwise("Old")
        ... )
        >>> result.show()
        +-------+---------------------------------------------------------------------------+
        |   name|CASE WHEN (age < 3) THEN Young WHEN (age < 5) THEN Middle-aged ELSE Old END|
        +-------+---------------------------------------------------------------------------+
        |  Alice|                                                                       Young|
        |    Bob|                                                                 Middle-aged|
        |Charlie|                                                                         Old|
        +-------+---------------------------------------------------------------------------+

        Example 3: Using literal values as conditions

        >>> from pyspark.sql import functions as sf
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> result = df.select(
        ...     df.name, sf.when(sf.lit(True), 1).otherwise(
        ...         sf.raise_error("unreachable")).alias("when"))
        >>> result.show()
        +-----+----+
        | name|when|
        +-----+----+
        |Alice|   1|
        |  Bob|   1|
        +-----+----+

        See Also
        --------
        pyspark.sql.functions.when
        """
        ...

    @dispatch_col_method
    def otherwise(self, value: Any) -> "Column":
        """
        Evaluates a list of conditions and returns one of multiple possible result expressions.
        If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.

        .. versionadded:: 1.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        value
            a literal value, or a :class:`Column` expression.

        Returns
        -------
        :class:`Column`
            Column representing the value used for unmatched conditions.

        Examples
        --------
        >>> from pyspark.sql import functions as sf
        >>> df = spark.createDataFrame(
        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.select(df.name, sf.when(df.age > 3, 1).otherwise(0)).show()
        +-----+-------------------------------------+
        | name|CASE WHEN (age > 3) THEN 1 ELSE 0 END|
        +-----+-------------------------------------+
        |Alice|                                    0|
        |  Bob|                                    1|
        +-----+-------------------------------------+

        See Also
        --------
        pyspark.sql.functions.when
        """
        ...
    @dispatch_col_method
    def over(self, window: "WindowSpec") -> "Column":
        """
        Define a windowing column.

        .. versionadded:: 1.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        window : :class:`WindowSpec`

        Returns
        -------
        :class:`Column`

        Examples
        --------
        >>> from pyspark.sql import Window
        >>> window = (
        ...     Window.partitionBy("name")
        ...     .orderBy("age")
        ...     .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        ... )
        >>> from pyspark.sql.functions import rank, min, desc
        >>> df = spark.createDataFrame(
        ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.withColumn(
        ...     "rank", rank().over(window)
        ... ).withColumn(
        ...     "min", min('age').over(window)
        ... ).sort(desc("age")).show()
        +---+-----+----+---+
        |age| name|rank|min|
        +---+-----+----+---+
        |  5|  Bob|   1|  5|
        |  2|Alice|   1|  2|
        +---+-----+----+---+
        """
        ...
    @dispatch_col_method
    def __nonzero__(self) -> None:
        ...

    @dispatch_col_method
    def __bool__(self) -> None:
        ...

    @dispatch_col_method
    def __repr__(self) -> str:
        ...
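    # NOTE (editor's sketch, not part of the original source): `__bool__` is overridden so
    # that a Column cannot be used in a plain Python boolean context (e.g. `if df.age > 3:`
    # or `col1 and col2`); such usage raises an error along the lines of the following
    # (exact class and message may vary by version):
    #
    #   >>> bool(df.age > 3)  # doctest: +SKIP
    #   Traceback (most recent call last):
    #       ...
    #   PySparkValueError: [CANNOT_CONVERT_COLUMN_INTO_BOOL] Cannot convert column into bool: ...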
def _test() -> None:
    import doctest
    from pyspark.sql import SparkSession
    import pyspark.sql.column

    globs = pyspark.sql.column.__dict__.copy()
    spark = SparkSession.builder.master("local[4]").appName("sql.column tests").getOrCreate()
    globs["spark"] = spark

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.column,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF,
    )
    spark.stop()
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()