"events": {
    "a": {
        "eventName": "a",
        "eventIdentifier": "abc"
    },
    "b": {
        "eventName": "b",
        "eventIdentifier": "xyz"
    }
}

I want to split events "a" and "b" into two different records using Spark SQL only. Note that the JSON above is not wrapped in an array.

Output:

+---------+-------------------+
|eventname|eventidentifier    |
+---------+-------------------+
|a        |abc                |
|b        |xyz                |
+---------+-------------------+

2 Answers


  1. To split the events "a" and "b" into two separate records, you can read "events" as a map and use the explode function to turn each map entry into its own row. explode only accepts array or map input, so the JSON has to be read with an explicit schema (schema inference would produce a struct). Here’s a way to achieve this:

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions._
    import org.apache.spark.sql.types._

    object JsonSplitExample {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .appName("JsonSplitExample")
          .master("local[*]")  // Change this for a cluster deployment
          .getOrCreate()

        import spark.implicits._

        val jsonString = """
          {
            "events": {
              "a": {
                "eventName": "a",
                "eventIdentifier": "abc"
              },
              "b": {
                "eventName": "b",
                "eventIdentifier": "xyz"
              }
            }
          }
        """

        // Read "events" as a map of event key -> event struct.
        // The explicit schema matters: with inference Spark reads
        // "events" as a struct, and explode only accepts arrays or maps.
        val schema = new StructType().add("events",
          MapType(StringType, new StructType()
            .add("eventName", StringType)
            .add("eventIdentifier", StringType)))

        val df = spark.read.schema(schema).json(Seq(jsonString).toDS())

        // Explode the map: each entry becomes its own row (key, event)
        val explodedDF = df.select(explode(col("events")).as(Seq("key", "event")))

        // Select the fields from the exploded event structure
        val resultDF = explodedDF.select(
          col("event.eventName").alias("eventName"),
          col("event.eventIdentifier").alias("eventIdentifier")
        )

        resultDF.show()
        spark.stop()
      }
    }
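
    Since the question asks for Spark SQL only, the same explode can also be written as a plain SQL query over a temporary view. A minimal sketch, reusing df and the map schema from above (the view name raw is just for illustration):

    df.createOrReplaceTempView("raw")
    spark.sql("""
      SELECT value.eventName       AS eventname,
             value.eventIdentifier AS eventidentifier
      FROM (SELECT explode(events) FROM raw) t
    """).show()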

  2. Use the array function to collect the event structs into an array of rows, and the inline function to flatten (explode) that array into separate records.

    Logic – inline(array(events.*))
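
    The transcripts below assume a temporary view named input has already been registered. A minimal sketch of how it could be created from the question's JSON (in spark-shell, spark.implicits._ is already in scope; with schema inference, events becomes a struct of structs):

    scala> val json = """{"events":{"a":{"eventName":"a","eventIdentifier":"abc"},"b":{"eventName":"b","eventIdentifier":"xyz"}}}"""
    scala> spark.read.json(Seq(json).toDS()).createOrReplaceTempView("input")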

    scala> spark.table("input").show(false)
    +--------------------+
    |events              |
    +--------------------+
    |{{abc, a}, {xyz, b}}|
    +--------------------+
    
    
    scala> spark.table("input").toJSON.show(false)
    +--------------------------------------------------------------------------------------------------------+
    |value                                                                                                   |
    +--------------------------------------------------------------------------------------------------------+
    |{"events":{"a":{"eventIdentifier":"abc","eventName":"a"},"b":{"eventIdentifier":"xyz","eventName":"b"}}}|
    +--------------------------------------------------------------------------------------------------------+
    
    
    scala> spark.sql("""select inline(array(events.a,events.b)) from input""").show(false)
    +---------------+---------+
    |eventIdentifier|eventName|
    +---------------+---------+
    |abc            |a        |
    |xyz            |b        |
    +---------------+---------+
    

    OR

    scala> spark.sql("""select inline(array(events.*)) from input""").show(false)
    +---------------+---------+
    |eventIdentifier|eventName|
    +---------------+---------+
    |abc            |a        |
    |xyz            |b        |
    +---------------+---------+
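
    Note that inline emits the struct fields in schema order, so eventIdentifier comes first. If the column order requested in the question matters, a subquery can reselect the columns; a small sketch in the same style:

    scala> spark.sql("""select eventName, eventIdentifier from (select inline(array(events.*)) from input) t""").show(false)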
    
    