skip to Main Content

I am implementing an entity-attribute-value model to store nationwide property data that will be updated daily: 130 million properties, will probably store billions of values. We also want to search this data on arbitrary attributes, hence using EAV so we can index the values for each attribute and search on them (instead of one massive table with a slew of indexes). I’m using ruby and rails, so queries are built with rails’ various macros.

Here is the schema:

CREATE TABLE `properties` (
  `id` bigint NOT NULL AUTO_INCREMENT,
  `aid` int NOT NULL,
  `deleted_at` datetime DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `index_properties_on_deleted_at_and_aid` (`deleted_at`,`aid`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb3;

CREATE TABLE `property_attributes` (
  `id` bigint NOT NULL AUTO_INCREMENT,
  `name` varchar(100) DEFAULT NULL,
  `value_type` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `index_property_attributes_on_name` (`name`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb3;

CREATE TABLE `property_value_booleans` (
  `id` bigint NOT NULL AUTO_INCREMENT,
  `aid` int NOT NULL,
  `attribute_id` bigint NOT NULL,
  `value` tinyint(1) DEFAULT NULL,
  `deleted_at` datetime DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `index_pvb_on_daav` (`deleted_at`,`aid`,`attribute_id`,`value`),
  KEY `index_pvb_on_dav` (`deleted_at`,`attribute_id`,`value`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb3;

CREATE TABLE `property_value_dates` (
  `id` bigint NOT NULL AUTO_INCREMENT,
  `aid` int NOT NULL,
  `attribute_id` bigint NOT NULL,
  `value` date DEFAULT NULL,
  `deleted_at` datetime DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `index_pvd_on_daav` (`deleted_at`,`aid`,`attribute_id`,`value`),
  KEY `index_pvd_on_dav` (`deleted_at`,`attribute_id`,`value`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb3;

CREATE TABLE `property_value_numerics` (
  `id` bigint NOT NULL AUTO_INCREMENT,
  `aid` int NOT NULL,
  `attribute_id` bigint NOT NULL,
  `value` decimal(18,6) DEFAULT NULL,
  `deleted_at` datetime DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `index_pvn_on_daav` (`deleted_at`,`aid`,`attribute_id`,`value`),
  KEY `index_pvn_on_dav` (`deleted_at`,`attribute_id`,`value`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb3;

CREATE TABLE `property_value_strings` (
  `id` bigint NOT NULL AUTO_INCREMENT,
  `aid` int NOT NULL,
  `attribute_id` bigint NOT NULL,
  `value` varchar(255) DEFAULT NULL,
  `created_at` datetime DEFAULT NULL,
  `deleted_at` datetime DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `index_pvs_on_daav` (`deleted_at`,`aid`,`attribute_id`,`value`),
  KEY `index_pvs_on_dav` (`deleted_at`,`attribute_id`,`value`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb3;

The following is the query which is relatively simple in nature (conceptually), but running this will periodically cause mysql to hang in the "statistics" state for what seems like an indefinite amount of time:

SELECT COUNT(*) FROM `properties` WHERE `properties`.`deleted_at` IS NULL AND 
(exists (SELECT 1 FROM `property_value_strings` as t_74653 WHERE t_74653.`deleted_at` IS NULL AND t_74653.`attribute_id` = 48 AND t_74653.`value` = 'NC' AND (t_74653.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_a9e53 WHERE t_a9e53.`deleted_at` IS NULL AND t_a9e53.`attribute_id` = 14 AND t_a9e53.`value` = 'Wake' AND (t_a9e53.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_a225b WHERE t_a225b.`deleted_at` IS NULL AND t_a225b.`attribute_id` = 163 AND t_a225b.`value` IN ('181', '366', '369', '378', '385', '386', '388') AND (t_a225b.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_booleans` as t_2b45b WHERE t_2b45b.`deleted_at` IS NULL AND t_2b45b.`attribute_id` = 2 AND t_2b45b.`value` = TRUE AND (t_2b45b.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_dates` as t_54497 WHERE t_54497.`deleted_at` IS NULL AND t_54497.`attribute_id` = 174 AND t_54497.`value` BETWEEN '2017-05-16' AND '2017-06-15' AND (t_54497.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_numerics` as t_a525e WHERE t_a525e.`deleted_at` IS NULL AND t_a525e.`attribute_id` = 175 AND t_a525e.`value` BETWEEN 200000.0 AND 1000000.0 AND (t_a525e.`aid` = `properties`.`aid`))) AND 

# Just checking for existence of these values, not concerned with actual value.
(exists (SELECT 1 FROM `property_value_strings` as t_79c04 WHERE t_79c04.`deleted_at` IS NULL AND t_79c04.`attribute_id` = 39 AND (t_79c04.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_3936d WHERE t_3936d.`deleted_at` IS NULL AND t_3936d.`attribute_id` = 47 AND (t_3936d.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_07d8d WHERE t_07d8d.`deleted_at` IS NULL AND t_07d8d.`attribute_id` = 49 AND (t_07d8d.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_numerics` as t_0506f WHERE t_0506f.`deleted_at` IS NULL AND t_0506f.`attribute_id` = 54 AND (t_0506f.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_numerics` as t_2172b WHERE t_2172b.`deleted_at` IS NULL AND t_2172b.`attribute_id` = 55 AND (t_2172b.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_dates` as t_b2d33 WHERE t_b2d33.`deleted_at` IS NULL AND t_b2d33.`attribute_id` = 174 AND (t_b2d33.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_numerics` as t_cc807 WHERE t_cc807.`deleted_at` IS NULL AND t_cc807.`attribute_id` = 175 AND (t_cc807.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_34123 WHERE t_34123.`deleted_at` IS NULL AND t_34123.`attribute_id` = 0 AND (t_34123.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_74163 WHERE t_74163.`deleted_at` IS NULL AND t_74163.`attribute_id` = 99 AND (t_74163.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_f3994 WHERE t_f3994.`deleted_at` IS NULL AND t_f3994.`attribute_id` = 107 AND (t_f3994.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_ee85d WHERE t_ee85d.`deleted_at` IS NULL AND t_ee85d.`attribute_id` = 108 AND (t_ee85d.`aid` = `properties`.`aid`))) AND 
(exists (SELECT 1 FROM `property_value_strings` as t_2f858 WHERE t_2f858.`deleted_at` IS NULL AND t_2f858.`attribute_id` = 109 AND (t_2f858.`aid` = `properties`.`aid`)))

I can run analyze table property_value_strings, which sometimes makes the query finish, but not sure if that is a feasible solution for production usage. When the query works it is very fast (on a sample of 100,000 properties locally, this query returns 7,400 in 43ms).

One thing I have tried so far is to set optimizer_search_depth to 0, but that doesn’t seem to have fixed anything.

My questions are:

  1. What is causing mysql to hang in the "statistics" state? I assume it’s the number of exists statements being used, but why? Can this be avoided using these queries?
  2. Is this the most efficient way to search these value tables? I’m totally open to rearranging the query if there is a better way to search.

4

Answers


  1. The issues you are encountering arise from the complexity of the query, the extensive use of EXISTS subqueries, and potential inefficiencies in indexing and optimisation within MySQL.

    What Causes MySQL to Stall in the "Statistics" State?

    The "statistics" state signifies that MySQL is calculating execution plans for the query. This phase can be particularly resource-intensive when:

    • Numerous subqueries (EXISTS) are involved, each requiring filtered scans over large datasets.
    • MySQL evaluates multiple combinations of join strategies due to the complexity of the query.
    • Indexing is insufficient, resulting in excessive data scans.

    Is This the Most Efficient Way to Query the Data?

    Not entirely. While EXISTS is suitable for some situations, the sheer volume of subqueries in your example risks overburdening the optimiser. This can lead to significant inefficiencies, particularly when working with billions of records.

    To improve this I would:

    1. Use composite indexes for the property_value_* tables covering (attribute_id, value, aid, deleted_at)
    2. Replace multiple EXISTS subqueries with JOINs and conditional aggregation like
        SELECT COUNT(DISTINCT properties.aid)
        FROM properties
        LEFT JOIN property_value_strings pvs1 
            ON pvs1.aid = properties.aid 
            AND pvs1.deleted_at IS NULL 
            AND pvs1.attribute_id = 48 
            AND pvs1.value = 'NC'
        LEFT JOIN property_value_strings pvs2 
            ON pvs2.aid = properties.aid 
            AND pvs2.deleted_at IS NULL 
            AND pvs2.attribute_id = 14 
            AND pvs2.value = 'Wake'
        LEFT JOIN property_value_numerics pvn 
            ON pvn.aid = properties.aid 
            AND pvn.deleted_at IS NULL 
            AND pvn.attribute_id = 175 
            AND pvn.value BETWEEN 200000.0 AND 1000000.0
        -- Add additional joins as necessary
        WHERE properties.deleted_at IS NULL
        GROUP BY properties.aid
        HAVING COUNT(pvs1.id) > 0
           AND COUNT(pvs2.id) > 0
           AND COUNT(pvn.id) > 0;
    
    1. Partition the tables – partitioning can significantly reduce the amount of data scanned in each query
    Login or Signup to reply.
  2. It is a good practice to always use schema name with Table in a SQL query. It will also help you to make this query faster.

    Login or Signup to reply.
  3. Not spectacular but the following should yield 5 and replace 5 EXISTs. With an index on attribute_id this should be faster both in execution as in analysis time.

    5 = (SELECT COUNT(*)
    FROM `property_value_strings` as t
    WHERE t.`deleted_at` IS NULL
        AND t.`attribute_id` IN (39, 47, 49, 54, 55)
        AND t.`aid` = `properties`.`aid`)
    
    Login or Signup to reply.
  4. I created a random set of test data, and found your query to be SLOW (more than two minutes).

    After rewriting it, it was fast, under 1 second.

    SELECT COUNT(*)
    FROM `properties` 
    WHERE `properties`.`deleted_at` IS NULL AND 
        (exists (SELECT 1 FROM `property_value_strings` as t_74653 
                 WHERE t_74653.`deleted_at` IS NULL AND t_74653.`attribute_id` = 48 AND t_74653.`value` = 'NC' AND (t_74653.`aid` = `properties`.`aid`))) AND 
        (exists (SELECT 1 FROM `property_value_strings` as t_a9e53 
                 WHERE t_a9e53.`deleted_at` IS NULL AND t_a9e53.`attribute_id` = 14 AND t_a9e53.`value` = 'Wake' AND (t_a9e53.`aid` = `properties`.`aid`))) AND 
        (exists (SELECT 1 FROM `property_value_strings` as t_a225b 
                 WHERE t_a225b.`deleted_at` IS NULL AND t_a225b.`attribute_id` = 163 AND t_a225b.`value` IN ('181', '366', '369', '378', '385', '386', '388') AND (t_a225b.`aid` = `properties`.`aid`))) AND 
        (exists (SELECT 1 FROM `property_value_booleans` as t_2b45b 
                 WHERE t_2b45b.`deleted_at` IS NULL AND t_2b45b.`attribute_id` = 2 AND t_2b45b.`value` = TRUE AND (t_2b45b.`aid` = `properties`.`aid`))) AND 
        (exists (SELECT 1 FROM `property_value_dates` as t_54497 
                 WHERE t_54497.`deleted_at` IS NULL AND t_54497.`attribute_id` = 174 AND t_54497.`value` BETWEEN '2017-05-16' AND '2017-06-15' AND (t_54497.`aid` = `properties`.`aid`))) AND 
        (exists (SELECT 1 FROM `property_value_numerics` as t_a525e 
                 WHERE t_a525e.`deleted_at` IS NULL AND t_a525e.`attribute_id` = 175 AND t_a525e.`value` BETWEEN 200000.0 AND 1000000.0 AND (t_a525e.`aid` = `properties`.`aid`))) AND 
        # Just checking for existence of these values, not concerned with actual value.
           (select count(*) FROM (
               select attribute_id, count(*) 
               from property_value_strings 
               where `attribute_id` in (39,47,49,0,99,107,108,109)
               group by attribute_id 
               having count(*)>0
            ) x)=8 AND  
           (select count(*) FROM ( 
               select attribute_id, count(*) 
               from property_value_numerics
               where `attribute_id` in (54,55)
               group by attribute_id 
               having count(*)>0
           ) x) =2 
    ;
    

    Please note that under "Just checking for existence of these values, not concerned with actual value", the values 174 and 175 are not checked, because they are already checked with a value.

    My test set is not very large:

    select 'properties',count(*) from properties p
    union all
    select 'boolans',count(*) from property_value_booleans pvb
    union all
    select 'dates',count(*) from property_value_dates pvd 
    union all
    select 'numerics',count(*) from property_value_numerics pvn 
    union all
    select 'strings',count(*) from property_value_strings pvs 
    
    properties count(*)
    properties 234
    boolans 1201
    dates 1802
    numerics 2403
    strings 10626
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search