skip to Main Content

I have the following PySpark DataFrame. I need to extract JSON data stored as strings within a column in my DataFrame, and subsequently create multiple rows based on the extracted data in PySpark.

Example Input:

response (string column)
{"Customer Data": [{"id": "5", "name": "Jony"}, {"id": "10", "name": "Jems"}]}

Expected output:

id (int) name (string)
5 Jony
10 Jems

Any ideas how to do this?

2

Answers


  1. Hope this fits your needs:

    from pyspark.sql import Row
    from pyspark.sql.types import StructField, StructType, StringType, IntegerType, ArrayType
    from pyspark.sql import functions as f
    
    data = [
        Row(response='{"Customer Data": [{"id": "5", "name": "Jony"}, {"id": "10", "name": "Jems"}]}'),
    ]
    
    schema = StructType([
        StructField(name="response", dataType=StringType())
    ])
    
    json_schema = StructType([
        StructField(
            name="Customer Data", 
            dataType=ArrayType(
                StructType([
                    StructField(name="id", dataType=StringType()),
                    StructField(name="name", dataType=StringType())
                ])
            )
        ),
    ])
    
    df = (
        spark.createDataFrame(data=data, schema=schema)
        .withColumn("parsed_json", f.from_json("response", schema=json_schema))
        .selectExpr("parsed_json.*")
        .withColumn("CustomerDataExploded", f.explode("Customer Data"))
        .select("CustomerDataExploded.*")
        .withColumn("id", f.col("id").cast(IntegerType()))
    )
    
    df.show()
    
    +---+----+
    | id|name|
    +---+----+
    |  5|Jony|
    | 10|Jems|
    +---+----+
    
    
    Login or Signup to reply.
  2. u can try this, it might help u:

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import from_json, explode, col
    from pyspark.sql.types import StructType, StructField, StringType, ArrayType
    
    spark = SparkSession.builder.appName("json_extraction").getOrCreate()
    
    df = spark.createDataFrame(
        [("""{"Customer Data": [{"id": "5", "name": "Jony"}, {"id": "10", "name": "Jems"}]}""",)],
        ["response"]
    )
    
    schema = StructType([
        StructField("Customer Data", ArrayType(
            StructType([
                StructField("id", StringType()),
                StructField("name", StringType())
            ])
        ))
    ])
    
    df_parsed = df.withColumn("parsed", from_json("response", schema))
    
    df_exploded = df_parsed.select(
        explode(col("parsed.Customer Data")).alias("customer_data")
    ).select(
        col("customer_data.id").cast("int"),  # Cast the id to int as per expected output
        col("customer_data.name")
    )
    df_exploded.show()
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search