Fast alle Beispiele verwenden die filter()-Funktion (Ausnahme: die Anzeige von MAX-Werten).
df steht für DataFrame.
# Run a SQL query that selects every column and row of "datasource" and keep the result as df.
# NOTE(review): assumes a SparkSession named `spark` already exists (e.g. provided by the notebook) — confirm.
df = spark.sql('SELECT * FROM datasource')
--Einfache Filterung
from pyspark.sql import functions as F
# Column-expression form: keep only rows whose date column equals "2023-03-01".
# NOTE(review): display() is presumably the notebook renderer (e.g. Databricks); plain PySpark offers show() — confirm.
df.filter(df.date == "2023-03-01").display()
Alternative:
from pyspark.sql import functions as F
# SQL-expression string form: the condition is passed as text and parsed by Spark.
df.filter('date = "2023-02-01"').display()
-- Filterung "UND"
from pyspark.sql import functions as F
# Logical AND of two conditions. Each comparison is wrapped in parentheses because
# Python's & operator binds more tightly than ==.
df.filter((F.col('date') == '2023-03-01') & (F.col('market') == 'DE')).display()
Alternative:
from pyspark.sql import functions as F
# SQL-expression string form of the AND filter; uses the same date/market columns
# as the Column-expression example so both variants address the same data.
df.filter('date = "2023-02-01" and market = "DE"').display()
--Filterung "ODER"
from pyspark.sql import functions as F
# Logical OR on a single column: keep rows where columnname1 is "ab" or "cd".
# Each comparison is parenthesised because | binds more tightly than ==.
df.filter((F.col('columnname1') == "ab") | (F.col('columnname1') == "cd")).display()
Alternative:
from pyspark.sql import functions as F
# SQL-expression string form of OR: a row is kept if either comparison is true.
# NOTE(review): `group` is also an SQL keyword; quote it with backticks if the parser rejects it — confirm.
df.filter('columnname = "abc" or group = "b2b"').display()
from pyspark.sql import functions as F
# isin() keeps rows whose columnname equals any of the listed values,
# a compact alternative to chaining several OR comparisons on one column.
df.filter(df.columnname.isin('abc','b2b')).display()
-- Filterung IS NULL
from pyspark.sql import functions as F
# Combine an equality test with isNull(): rows where year is "2023" and month has no value.
df.filter((F.col('year') == "2023") & (F.col('month').isNull())).display()
--Anzeige von MAX-Werten
from pyspark.sql import functions as F
# Aggregation rather than a filter: select the maximum value of columnname and print it with show().
df.select(F.max(F.col("columnname"))).show()
-- Filterung Größer als
from pyspark.sql import functions as F
# Greater-than filter. The threshold is a numeric literal so the comparison is
# made against a number rather than against the string "1000".
df.filter(df.columnname > 1000).display()
Alternative:
from pyspark.sql import functions as F
# SQL-expression string form with two bounds: keep rows where 1000 < columnname < 2000.
df.filter('columnname > 1000 and columnname < 2000').display()
from pyspark.sql import functions as F
# Greater-or-equal on date combined (AND) with an equality test on columnname.
df.filter((F.col('date') >= '2023-03-01') & (F.col('columnname') == 'abc')).display()
Alternative:
from pyspark.sql import functions as F
# SQL-expression string form: date on or after 2023-02-01 and columnname equal to "b2c".
df.filter('date >= "2023-02-01" and columnname = "b2c"').display()
-- Filterung Enthält/Beginnt mit / Endet mit
from pyspark.sql import functions as F
# contains() keeps rows whose columnname includes the substring 'm' at any position.
df.filter(df.columnname.contains('m')).display()
Weitere Befehle statt "contains": startswith('m') (beginnt mit) und endswith('m') (endet mit).
Alternative - SQL-Like:
from pyspark.sql import functions as F
# SQL LIKE pattern on the group column: % matches any sequence of characters,
# so 'm%' keeps values that start with "m".
df.filter(df.group.like('m%')).display()
--Filterung mit regulären Ausdrücken
Ein großer Vorteil gegenüber reinen SQL-Funktionen:
from pyspark.sql import functions as F
# rlike() filters with a regular expression; 'b2c|b2b' keeps rows whose columnname
# contains either "b2c" or "b2b".
df.filter(df.columnname.rlike('b2c|b2b')).display()
Header: istock/solarseven