skip to Main Content

The task is to exclude row by a group where the "word" field is NULL

Tables

CREATE TABLE `search_phrases` (
  `id` int unsigned NOT NULL AUTO_INCREMENT,
  `phrase` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL,
  `frequency` bigint NOT NULL,
  PRIMARY KEY (`id`),
  KEY `search_phrases_phrase_index` (`phrase`),
  KEY `search_phrases_frequency_index` (`frequency`)
) ENGINE=InnoDB AUTO_INCREMENT=1000001 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

CREATE TABLE `search_words` (
  `id` int unsigned NOT NULL AUTO_INCREMENT,
  `word` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL,
  PRIMARY KEY (`id`),
  UNIQUE KEY `search_words_word_unique` (`word`)
) ENGINE=InnoDB AUTO_INCREMENT=128287 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

CREATE TABLE `search_combinable` (
  `search_phrase_id` int unsigned NOT NULL,
  `search_word_id` int unsigned NOT NULL,
  KEY `search_combinable_search_phrase_id_foreign` (`search_phrase_id`),
  KEY `search_combinable_search_word_id_foreign` (`search_word_id`),
  CONSTRAINT `search_combinable_search_phrase_id_foreign` FOREIGN KEY (`search_phrase_id`) REFERENCES `search_phrases` (`id`),
  CONSTRAINT `search_combinable_search_word_id_foreign` FOREIGN KEY (`search_word_id`) REFERENCES `search_words` (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

CREATE TABLE `requested_words` (
  `id` int unsigned NOT NULL AUTO_INCREMENT,
  `word` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL,
  `ts` timestamp NULL DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `requested_words_word_index` (`word`)
) ENGINE=InnoDB AUTO_INCREMENT=436 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

/* some sample data */
INSERT INTO search_phrases VALUES (1, 'men''s shorts', 1031588), (2, 'red shorts', 456000), (3, 'green shorts', 436000);
INSERT INTO search_words VALUES (1, 'men''s'), (2, 'shorts'), (3, 'red'), (4, 'green');
INSERT INTO search_combinable VALUES (1, 1), (1, 2), (2, 3), (2, 2), (3, 4), (3, 2);
INSERT INTO requested_words VALUES (1, 'men''s', '2023-03-01 07:45:49'), (2, 'shorts', '2023-03-01 07:45:49'), (3, 'red', '2023-03-01 07:45:49');

As a result, there should be phrases that contain words from the requested words table

select sp.id, sp.phrase, sp.frequency from search_phrases as sp
inner join search_combinable as sc on sc.search_phrase_id = sp.id
inner join search_words as sw on sw.id = sc.search_word_id
inner join requested_words as rw on rw.word = sw.word
where sp.id not in (
    select sp.id from search_phrases as temp_sp
    inner join search_combinable as sc on sc.search_phrase_id = temp_sp.id
    inner join search_words as sw on sw.id = sc.search_word_id
    left join requested_words as rw on rw.word = sw.word
    where rw.word is null and temp_sp.id = sp.id
)
group by sp.id
order by sp.frequency desc

The execution time is about 29 seconds. It grows in proportion to the increase in the "request_table"

| id | select_type | table   | partitions | type   | possible_keys                                                                       | key                                        | key_len | ref                         | rows | filtered | Extra                                        |
|----|-------------|---------|------------|--------|-------------------------------------------------------------------------------------|--------------------------------------------|---------|-----------------------------|------|----------|----------------------------------------------|
| 1  | SIMPLE      | rw      |            | index  | requested_words_word_index                                                          | requested_words_word_index                 | 1022    |                             | 435  | 100      | Using index; Using temporary; Using filesort |
| 1  | SIMPLE      | sw      |            | eq_ref | PRIMARY,search_words_word_unique                                                    | search_words_word_unique                   | 1022    | laravel.rw.word             | 1    | 100      | Using index                                  |
| 1  | SIMPLE      | sc      |            | ref    | search_combinable_search_phrase_id_foreign,search_combinable_search_word_id_foreign | search_combinable_search_word_id_foreign   | 4       | laravel.sw.id               | 17   | 100      |                                              |
| 1  | SIMPLE      | sp      |            | eq_ref | PRIMARY,search_phrases_phrase_index,search_phrases_frequency_index                  | PRIMARY                                    | 4       | laravel.sc.search_phrase_id | 1    | 100      |                                              |
| 1  | SIMPLE      | temp_sp |            | eq_ref | PRIMARY                                                                             | PRIMARY                                    | 4       | laravel.sc.search_phrase_id | 1    | 100      | Using where; Not exists; Using index         |
| 1  | SIMPLE      | sc      |            | ref    | search_combinable_search_phrase_id_foreign,search_combinable_search_word_id_foreign | search_combinable_search_phrase_id_foreign | 4       | laravel.sc.search_phrase_id | 2    | 100      |                                              |
| 1  | SIMPLE      | sw      |            | eq_ref | PRIMARY                                                                             | PRIMARY                                    | 4       | laravel.sc.search_word_id   | 1    | 100      |                                              |
| 1  | SIMPLE      | rw      |            | ref    | requested_words_word_index                                                          | requested_words_word_index                 | 1022    | laravel.sw.word             | 4    | 100      | Using where; Using index                     |

Dataset

search_phrases

id phrase frequency
1 men’s shorts 1031588
2 red shorts 456000
3 green shorts 436000

search_combinable

search_phrase_id search_word_id
1 1
1 2
2 3
2 2
3 4
3 2

search_words

id word
1 men’s
2 shorts
3 red
4 green

requested_words

id word ts
1 men’s 2023-03-01 07:45:49
2 shorts 2023-03-01 07:45:49
3 red 2023-03-01 07:45:49

in the response, I expect to see the phrases: men’s shorts and red shorts

P.S
After adding the primary key to the search_combinable table

CREATE TABLE `search_combinable` (
  `search_phrase_id` int unsigned NOT NULL,
  `search_word_id` int unsigned NOT NULL,
  PRIMARY KEY (`search_word_id`,`search_phrase_id`),
  KEY `search_combinable_search_word_id_search_phrase_id_index` (`search_word_id`,`search_phrase_id`),
  KEY `search_combinable_search_phrase_id_foreign` (`search_phrase_id`),
  CONSTRAINT `search_combinable_search_phrase_id_foreign` FOREIGN KEY (`search_phrase_id`) REFERENCES `search_phrases` (`id`),
  CONSTRAINT `search_combinable_search_word_id_foreign` FOREIGN KEY (`search_word_id`) REFERENCES `search_words` (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

select sp.id, sp.phrase, sp.frequency
from search_phrases as sp
where not exists (
    select 1
    from search_phrases sp2
    left join search_combinable as sc on sc.search_phrase_id = sp2.id
    left join search_words as sw on sw.id = sc.search_word_id
    left join requested_words as rw on rw.word = sw.word
    where sp2.id = sp.id
    and rw.word is null
)
order by sp.frequency desc;
id select_type table partitions type possible_keys key key_len ref rows filtered Extra
1 SIMPLE sw index PRIMARY search_words_word_unique 1022 43359 100 Using index; Using temporary; Using filesort
1 SIMPLE rw ref requested_words_word_index requested_words_word_index 1022 laravel.sw.word 4 100 Using index
1 SIMPLE sc ref PRIMARY,search_combinable_search_word_id_search_phrase_id_index,search_combinable_search_phrase_id_foreign search_combinable_search_word_id_search_phrase_id_index 4 laravel.sw.id 18 100 Using index
1 SIMPLE sp eq_ref PRIMARY,search_phrases_phrase_index,search_phrases_frequency_index PRIMARY 4 laravel.sc.search_phrase_id 1 100

2

Answers


  1. Remove all the redundancy, all of those tables are involved in the query – you don’t also need another set of joins etc. in the where clause:

    select 
        temp_sp.id, temp_sp.phrase, temp_sp.frequency 
    from search_phrases as temp_sp
    inner join search_combinable as sc on sc.search_phrase_id = temp_sp.id
    inner join search_words as sw on sw.id = sc.search_word_id
    inner join requested_words as rw on rw.word = sw.word
    where rw.word IS NOT NULL
    
    Login or Signup to reply.
  2. Your junction table search_combinable has no PK and is not indexed properly. It should be:

    CREATE TABLE `search_combinable` (
        `search_phrase_id` int unsigned NOT NULL,
        `search_word_id` int unsigned NOT NULL,
        PRIMARY KEY (`search_phrase_id`, `search_word_id`),
        KEY `idx_search_combinable_word_phrase` (`search_word_id`, `search_phrase_id`),
        CONSTRAINT `fk_search_combinable_search_phrase_id` FOREIGN KEY (`search_phrase_id`) REFERENCES `search_phrases` (`id`),
        CONSTRAINT `fk_search_combinable_search_word_id` FOREIGN KEY (`search_word_id`) REFERENCES `search_words` (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
    

    It may be that the PK and reverse should be the other way around. You can experiment with your dataset and known queries.

    The best I have come up with so far is:

    select sp.*
    from (
        select sc.search_phrase_id
        from search_combinable sc
        join search_words sw on sc.search_word_id = sw.id
        left join requested_words rw on sw.word = rw.word
        group by sc.search_phrase_id
        having count(distinct sw.id) = count(distinct rw.word)
    ) t
    join search_phrases sp on t.search_phrase_id = sp.id
    order by sp.frequency desc;
    

    With your test dataset it returns 47 rows in 0.93 – 0.97 seconds.

    I managed to squeeze a tiny bit off by pre-joining search_words and requested_words but YMMV:

    select sp.*
    from (
        select sc.search_phrase_id
        from search_combinable sc
        join (
            select sw.id, count(rw.id) > 0 AS req
            from search_words sw
            left join requested_words rw on sw.word = rw.word
            group by sw.id
        ) t on sc.search_word_id = t.id
        group by sc.search_phrase_id
        having count(t.id) = count(nullif(t.req, 0))
    ) t
    join search_phrases sp on t.search_phrase_id = sp.id
    order by sp.frequency desc;
    

    0.68 – 0.70 seconds

    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search