skip to Main Content

Given a row in an example table as:

Column A
abcccaaaaddeeaabbaa

How can we get the following output using MYSQL?

sequence occurrences
aaaa 1
ccc 1
dd 1
ee 1
aa 2
bb 1
a 1
b 1

Tried to compute this using MySQL. Cannot get the right logic for the same.

Edit: My latest attempt with BigQuery:

WITH Input AS (
SELECT ‘abcccaaaaddeeaabbaa’ AS str
),

Exploded AS (
SELECT
SUBSTR(str, pos, 1) AS char,
pos,
LAG(SUBSTR(str, pos, 1)) OVER (ORDER BY pos) AS prev_char
FROM Input,
UNNEST(GENERATE_ARRAY(1, LENGTH(str))) AS pos
),

Grouped AS (
SELECT
char,
pos,
IF(char = prev_char, 0, 1) AS new_group
FROM Exploded
),

Sequenced AS (
SELECT
char,
pos,
SUM(new_group) OVER (ORDER BY pos) AS sequence_id
FROM Grouped
),

Aggregated AS (
SELECT
sequence_id,
STRING_AGG(char, ” ORDER BY pos) AS sequence
FROM Sequenced
GROUP BY sequence_id
),

FinalCounts AS (
SELECT
sequence,
COUNT(*) AS occurrences
FROM Aggregated
GROUP BY sequence
)
select * from FinalCounts

2

Answers


  1. Split it up into one row per character, then identify runs of characters and summarize them:

    with recursive chars as (
      select bar,
        char_length(bar) idx,
        substr(bar, -1) ch
      from foo
    
      union all
    
      select bar, idx-1, substr(bar,idx-1,1)
      from chars
      where idx > 1
    ),
    charruns as (
      select ch, idx,
        count(1) over (order by idx)
          - count(1) over (partition by ch order by idx)
          run_id
      from chars
    ),
    runs as (
      select
        repeat(ch, count(1)) sequence,
        min(idx) idx
      from charruns
      group by ch, run_id
    ),
    occurrences as (
      select
        sequence,
        count(1) occurrences,
        min(idx) idx
      from runs
      group by sequence
    )
    select sequence, occurrences
    from occurrences
    order by char_length(sequence) desc, idx
    

    fiddle

    Login or Signup to reply.
  2. here is a shorter query

    SET @input_string = 'abcccaaaaddeeaabbaa';
    
    WITH RECURSIVE cte AS (
        -- init
        SELECT 
            @input_string AS original_string,
            CAST(REGEXP_SUBSTR(@input_string, '^(.)\1*') AS CHAR CHARACTER SET utf8mb4) AS prefix,
            CAST(REGEXP_REPLACE(@input_string, '^(.)\1*', '') AS CHAR CHARACTER SET utf8mb4) AS modified_string,
            1 AS step
        UNION ALL
        -- recursion 
        SELECT 
            original_string,
            CAST(REGEXP_SUBSTR(modified_string, '^(.)\1*') AS CHAR CHARACTER SET utf8mb4) AS prefix,
            CAST(REGEXP_REPLACE(modified_string, '^(.)\1*', '') AS CHAR CHARACTER SET utf8mb4) AS modified_string,
            step + 1 AS step
        FROM cte
        WHERE modified_string != '' AND REGEXP_SUBSTR(modified_string, '^(.)\1*') IS NOT NULL
    )
    SELECT 
        step,
        prefix,
        modified_string
    FROM cte;
    

    result:

    step    prefix  modified_string
    1   a   bcccaaaaddeeaabbaa
    2   b   cccaaaaddeeaabbaa
    3   ccc aaaaddeeaabbaa
    4   aaaa    ddeeaabbaa
    5   dd  eeaabbaa
    6   ee  aabbaa
    7   aa  bbaa
    8   bb  aa
    9   aa  
    

    sample dbfiddle

    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search