Python - Basics

Filtering Data

All examples use the filter() function.

df stands for DataFrame. The examples call .display(), which is available in Databricks notebooks; in plain PySpark, .show() is the equivalent.

 

df = spark.sql('SELECT * FROM datasource')
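If no table is available, a small test DataFrame can be created directly. A minimal sketch (the column names and values are placeholders matching the examples below):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [('2023-03-01', 'DE', 1500), ('2023-02-01', 'FR', 500)],
    ['date', 'market', 'columnname'],
)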

 

-- Simple filtering

from pyspark.sql import functions as F

df.filter(df.date == "2023-03-01").display()

 

Alternative:

from pyspark.sql import functions as F

df.filter('date = "2023-03-01"').display()

 

 

-- Filterung "UND"

from pyspark.sql import functions as F

df.filter((F.col('date') == '2023-03-01') & (F.col('market') == 'DE')).display()

 

Alternative:

from pyspark.sql import functions as F

df.filter('date = "2023-03-01" and market = "DE"').display()

 

 

--Filterung "ODER"

from pyspark.sql import functions as F

df.filter((F.col('columnname1') == "ab") | (F.col('columnname1') == "cd")).display()

 

Alternative:

from pyspark.sql import functions as F

 

df.filter('columnname1 = "ab" or columnname1 = "cd"').display()

 

For several values in the same column, isin() is more compact:

from pyspark.sql import functions as F

df.filter(df.columnname1.isin('ab', 'cd')).display()

 

 

-- Filtering IS NULL

from pyspark.sql import functions as F

df.filter((F.col('year') == "2023") & (F.col('month').isNull())).display()

 

 

 

-- Displaying MAX values

from pyspark.sql import functions as F

df.select(F.max(F.col("columnname"))).show()
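Alternative (a sketch with the same placeholder column): agg() returns the same one-row result without a select():

from pyspark.sql import functions as F

df.agg(F.max('columnname')).show()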

 

 

 

-- Filtering greater than

from pyspark.sql import functions as F

df.filter(df.columnname > 1000).display()

Note: compare against the number 1000, not the string "1000"; on a string column, the comparison would be lexicographic.

 

Alternative:

from pyspark.sql import functions as F

 

df.filter('columnname > 1000 and columnname < 2000').display()
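Such a range can also be written with between(); note that its bounds are inclusive (a sketch with the same placeholder column):

from pyspark.sql import functions as F

df.filter(F.col('columnname').between(1000, 2000)).display()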

 

from pyspark.sql import functions as F

df.filter((F.col('date') >= '2023-03-01') & (F.col('columnname') == 'abc')).display()

 

Alternative:

from pyspark.sql import functions as F

df.filter('date >= "2023-03-01" and columnname = "abc"').display()

 

 

-- Filtering contains / starts with / ends with

from pyspark.sql import functions as F

df.filter(df.columnname.contains('m')).display()

 

Other methods that can be used instead of "contains" (see the sketch after this list):

  • .startswith
  • .endswith
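A short sketch with the placeholder column used above:

df.filter(df.columnname.startswith('m')).display()

df.filter(df.columnname.endswith('m')).display()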

Alternative - SQL LIKE:

from pyspark.sql import functions as F

 

df.filter(df.columnname.like('m%')).display()

 

 

-- Filtering with regular expressions

A big advantage over plain SQL functions: rlike() accepts full regular expressions, whereas LIKE only supports the % and _ wildcards.

from pyspark.sql import functions as F

df.filter(df.columnname.rlike('b2c|b2b')).display()
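A slightly larger sketch (the pattern is hypothetical): anchors and character classes work as in any Java regex, which plain LIKE cannot express.

df.filter(df.columnname.rlike('^b2[cb]$')).display()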