skip to Main Content

I’m working on a SQL solution to squeeze the number of rows when there are matching "start" and "close" values across records. Here is an example input and the required output

Input

name start close
A 120 130
A 130 140
A 140 150
A 152 160
A 160 180
B 100 130
B 130 200
B 202 250
C 300 400

Required output

name start close
A 120 150
A 152 180
B 100 200
B 202 250
C 300 400

I tried with the lag() function, but I did not get the correct output. The null boundaries are crossed.

with 
     t1 ( 
            select name, start, close, lag(close) over(partition by name order by start) pclose from event 
        ),
     t2 (
            select * from t1 where 1 = (case when pclose is null then 1
                                             when start = pclose then 0 else 1 end)
        )
   select * from t2 order by name, start

Any standard complaint solution is welcome as I can port it easily to Spark.

2

Answers


  1. Again, a classic gaps-and-islands problem.

    Here’s how you can achieve the result:

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col, row_number, min, max, when, lit, sum
    from pyspark.sql.types import DateType
    from pyspark.sql.window import Window
    
    # Initialize Spark session
    spark = SparkSession.builder.appName("MergeIntervals").getOrCreate()
    
    # Sample data
    
    data = [
        ("A", 120, 130),
        ("A", 130, 140),
        ("A", 140, 150),
        ("A", 152, 160),
        ("A", 160, 180),
        ("B", 100, 130),
        ("B", 130, 200),
        ("B", 202, 250),
        ("C", 300, 400),
    ]
    
    # Create DataFrame
    df = spark.createDataFrame(data, ["name", "start", "close"])
    
    df = (
        df.withColumn("row_num", row_number().over(Window.orderBy("start", "close")))
        .withColumn(
            "previous_close",
            max(col("close")).over(
                Window.partitionBy("name")
                .orderBy("name", "start", "close")
                .rowsBetween(Window.unboundedPreceding, Window.currentRow - 1)
            ),
        )
        .withColumn(
            "island_start_indicator",
            when(col("previous_close") >= col("start"), lit(0)).otherwise(lit(1)),
        )
        .withColumn(
            "island_id",
            sum("island_start_indicator").over(Window.orderBy("name", "start", "close")),
        )
        .withColumn("island_min_start", min("start").over(Window.partitionBy("island_id")))
        .withColumn("island_max_close", max("close").over(Window.partitionBy("island_id")))
        .select(
            col("name"),
            col("island_min_start").alias("start"),
            col("island_max_close").alias("close"),
        )
        .distinct()
    )
    
    df.show()
    +----+-----+-----+
    |name|start|close|
    +----+-----+-----+
    |   A|  120|  150|
    |   A|  152|  180|
    |   B|  100|  200|
    |   B|  202|  250|
    |   C|  300|  400|
    +----+-----+-----+
    

    Here’s the SQL version:

    %sql
    with sample_data as (
      select "A" as name, 120 as start, 130 as close union all
      select "A" as name, 130 as start, 140 as close union all
      select "A" as name, 140 as start, 150 as close union all
      select "A" as name, 152 as start, 160 as close union all
      select "A" as name, 160 as start, 180 as close union all
      select "B" as name, 100 as start, 130 as close union all
      select "B" as name, 130 as start, 200 as close union all
      select "B" as name, 202 as start, 250 as close union all
      select "C" as name, 300 as start, 400
    ),
    windowed_data as (
      select
        *,
        max(close) over(
          partition by name
          order by
            name,
            start,
            close rows between unbounded preceding
            and 1 preceding
        ) as previous_close
      from
        sample_data
    ),
    island as (
      select
        *,
        sum(island_start_indicator) over (
          order by
            name,
            start,
            close
        ) as island_id
      from
        (
          select
            *,
            case
              when previous_close >= start then 0
              else 1
            end as island_start_indicator
          from
            windowed_data
        )
    )
    select
      distinct name,
      min(start) over(partition by island_id) as start,
      max(close) over(partition by island_id) as close
    from
      island
    
    Login or Signup to reply.
  2. That is sequential, so the start of one record must be the close of another.
    try this:

    SET @row_number1 = 0;
    SET @row_number2 = 0;
    with event as 
    (
        select 'A' as name, 120 as start, 130 as close
        union all
        select 'A', 130, 140
        union all
        select 'A', 140, 150
        union all
        select 'A', 152, 160
        union all
        select 'A', 160, 180
        union all
        select 'B', 100, 130
        union all
        select 'B', 130, 200
        union all
        select 'B', 202, 250
        union all
        select 'C', 300, 400
    )
    select a.name, a.start, b.close
    from 
    (
    select (@row_number1:=@row_number1 + 1) AS row_num, e1.name 'sign1', e3.name 'sign3', e2.* /*, e1.*, e3.* */
    from event e2 
    left join event e1 on e1.name = e2.name and e1.close =e2.start
    left join event e3 on e2.name = e3.name and e2.close =e3.start
    , (SELECT @row_number:=0) AS temp 
    where e1.name is null or e3.name is null 
    order by e2.name, e2.start, e2.close
    ) as a
    inner join
    (
    select (@row_number2:=@row_number2 + 1) AS row_num, e1.name 'sign1', e3.name 'sign3', e2.* /*, e1.*, e3.* */
    from event e2 
    left join event e1 on e1.name = e2.name and e1.close =e2.start
    left join event e3 on e2.name = e3.name and e2.close =e3.start
    , (SELECT @row_number:=0) AS temp 
    where e1.name is null or e3.name is null 
    order by e2.name, e2.start, e2.close
    ) as b
    on a.name = b.name and ( a.sign1 is null and b.sign3 is null) and (a.row_num = b.row_num or a.row_num = b.row_num - 1 );
    

    Only works when two events do not overlap.
    Result:

    name, start, close
    'A', '120', '150'
    'A', '152', '180'
    'B', '100', '200'
    'B', '202', '250'
    'C', '300', '400'
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search