I’m working on a SQL solution to reduce the number of rows by merging records whose "close" value matches the "start" of the next record for the same name. Here is an example input and the required output:
Input
name | start | close |
---|---|---|
A | 120 | 130 |
A | 130 | 140 |
A | 140 | 150 |
A | 152 | 160 |
A | 160 | 180 |
B | 100 | 130 |
B | 130 | 200 |
B | 202 | 250 |
C | 300 | 400 |
Required output
name | start | close |
---|---|---|
A | 120 | 150 |
A | 152 | 180 |
B | 100 | 200 |
B | 202 | 250 |
C | 300 | 400 |
I tried with the lag() function, but I did not get the correct output: the continuation rows are dropped, while the surviving rows keep their original close values, so the merged boundaries are wrong.
with t1 as (
    select name, start, close,
           lag(close) over (partition by name order by start) as pclose
    from event
),
t2 as (
    select * from t1
    where 1 = (case when pclose is null then 1
                    when start = pclose then 0
                    else 1 end)
)
select * from t2 order by name, start;
Any standard-compliant solution is welcome, as I can port it easily to Spark.
2 Answers
Again, a classic gaps-and-islands problem. Here's how you can achieve the result: flag every row that starts a new island with lag(), turn a running sum of that flag into an island id, then take min(start) and max(close) per island. Here's the SQL version:
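A sketch against the event table from your attempt; it only needs standard window functions and aggregation, so it should port to Spark SQL as-is:

```sql
with flagged as (
    -- 1 marks a row that starts a new island: its start does not
    -- equal the previous row's close for the same name
    select name, start, close,
           case when start = lag(close) over (partition by name order by start)
                then 0 else 1
           end as new_island
    from event
),
numbered as (
    -- a running sum of the flag gives every island its own id
    select name, start, close,
           sum(new_island) over (partition by name order by start) as island_id
    from flagged
)
select name, min(start) as start, max(close) as close
from numbered
group by name, island_id
order by name, start;
```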
This assumes the events are strictly sequential, i.e. a row is merged only when its start equals the close of the previous record; any gap starts a new output row.
try this:
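An alternative sketch without window functions, again assuming the event table from the question: keep the rows that start an island (their start is not the close of any other row) and pair each one with the nearest island end.

```sql
select s.name,
       s.start,
       min(e.close) as close                      -- nearest island end
from event s
join event e
  on  e.name  = s.name
 and  e.close > s.start
where not exists (select 1 from event p           -- s starts an island:
                  where p.name  = s.name          -- no row closes at s.start
                    and p.close = s.start)
  and not exists (select 1 from event n           -- e ends an island:
                  where n.name  = e.name          -- no row starts at e.close
                    and n.start = e.close)
group by s.name, s.start
order by s.name, s.start;
```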
Note: this only works when two events do not overlap.
Result:
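name | start | close |
---|---|---|
A | 120 | 150 |
A | 152 | 180 |
B | 100 | 200 |
B | 202 | 250 |
C | 300 | 400 |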