I’m building a language learning platform; here are some of the tables:
create table users
(
    id bigserial primary key,
    ...
);

create table users_languages
(
    id bigserial primary key,
    user_id bigint not null
        constraint users_languages_user_id_foreign references users,
    level varchar(255) not null,
    lang_code varchar(10) not null,
    ...
);
So users_languages keeps all the languages a user knows; level can be NATIVE or LEARN, and lang_code is the ISO code.
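For illustration, a native English speaker who is learning French and German would have rows like these (made-up example data, not from the question):

-- assumes a users row with id = 1 already exists (foreign key)
insert into users_languages (user_id, level, lang_code)
values (1, 'NATIVE', 'eng'),
       (1, 'LEARN', 'fre'),
       (1, 'LEARN', 'ger');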
I’m building a recommended search feature for the current user: based on the languages they know, the results must be ordered by the following rules (best match first):
- other user NATIVE lang = current user LEARN lang AND other user LEARN lang = current user NATIVE lang
- other user NATIVE lang = current user LEARN lang
- other user has at least one of current user langs
Here is what I figured out so far:
nativeCode := current user native lang_code
langCodes := current user learn lang_codes
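For example, these two parameters could be loaded with queries like the following (a sketch; :currentUserId is an assumed bind parameter):

-- nativeCode: the current user's native language
SELECT lang_code FROM users_languages WHERE user_id = :currentUserId AND level = 'NATIVE';
-- langCodes: the languages the current user is learning
SELECT lang_code FROM users_languages WHERE user_id = :currentUserId AND level = 'LEARN';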
WITH user_lang_priority AS NOT MATERIALIZED (
    SELECT l.user_id,
           MIN(CASE
                   WHEN l.level = 'NATIVE' AND l.lang_code IN (:langCodes) THEN
                       CASE
                           WHEN EXISTS (
                               SELECT ll.id FROM users_languages ll
                               WHERE ll.level != 'NATIVE'
                                 AND ll.lang_code = :nativeCode
                                 AND ll.user_id = l.user_id
                           ) THEN 1 ELSE 2
                       END
                   WHEN l.lang_code IN (:langCodes) THEN 3
               END) AS priority
    FROM users_languages l
    GROUP BY l.user_id
)
SELECT u.*
FROM users u
INNER JOIN user_lang_priority lp ON u.id = lp.user_id
GROUP BY u.id
ORDER BY lp.priority ASC, u.id DESC
The query seems to be returning the correct results, but it takes up to 2 seconds on a table with about 30k rows. Is there a way to speed it up?
UPD: the query plan:

Sort  (cost=777435.38..777508.24 rows=29145 width=762)
  Sort Key: (min(CASE WHEN (((l.level)::text = 'NATIVE'::text) AND ((l.lang_code)::text = ANY ('{fin,fre,ger}'::text[]))) THEN CASE WHEN (hashed SubPlan 2) THEN 1 ELSE 2 END WHEN ((l.lang_code)::text = ANY ('{rus,fre,ger}'::text[])) THEN 3 ELSE NULL::integer END)), u.id DESC
  ->  Hash Join  (cost=1677.14..775274.14 rows=29145 width=762)
        Hash Cond: (l.user_id = u.id)
        ->  GroupAggregate  (cost=0.29..773229.32 rows=29145 width=12)
              Group Key: l.user_id
              ->  Index Scan using users_languages_user_id_index on users_languages l  (cost=0.29..3005.29 rows=84970 width=19)
              SubPlan 2
                ->  Bitmap Heap Scan on users_languages ll  (cost=299.15..1487.36 rows=14990 width=8)
                      Recheck Cond: ((lang_code)::text = 'eng'::text)
                      Filter: ((level)::text <> 'NATIVE'::text)
                      ->  Bitmap Index Scan on users_languages_lang_code_index  (cost=0.00..295.40 rows=22814 width=0)
                            Index Cond: ((lang_code)::text = 'eng'::text)
        ->  Hash  (cost=1253.60..1253.60 rows=33860 width=758)
              ->  Seq Scan on users u  (cost=0.00..1253.60 rows=33860 width=758)
3 Answers
Consider creating indexes on the following columns:
- users_languages.user_id: this should improve the join operation with the users table.
- users_languages.level: to optimise the CASE statements.
- users_languages.lang_code: to optimise the filtering operation and the CASE statements.

After creating these indexes, re-run your query and check the performance. You can also update the statistics using the ANALYZE command:
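A minimal sketch of those statements (the index names are my own; the query plan above suggests the user_id and lang_code indexes may already exist):

-- hypothetical index names, adjust to your naming convention
CREATE INDEX IF NOT EXISTS users_languages_user_id_idx ON users_languages (user_id);
CREATE INDEX IF NOT EXISTS users_languages_level_idx ON users_languages (level);
CREATE INDEX IF NOT EXISTS users_languages_lang_code_idx ON users_languages (lang_code);

-- refresh planner statistics afterwards
ANALYZE users_languages;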
Lastly, if the users_languages table is very large and this query is frequently executed, you might consider using a materialized view to precompute the results.
Depending on how frequently the underlying data changes, you’ll need to schedule regular refreshes of the materialized view.
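A rough sketch of that idea, assuming the materialized view precomputes a priority for every pair of users sharing at least one language (the view name, columns, and query are my own, not from the answer):

-- hypothetical materialized view: best match priority per user pair
CREATE MATERIALIZED VIEW user_match_priority AS
SELECT cur.user_id AS current_user_id,
       oth.user_id AS other_user_id,
       CASE
           WHEN bool_or(oth.level = 'NATIVE' AND cur.level = 'LEARN')
            AND bool_or(oth.level = 'LEARN' AND cur.level = 'NATIVE') THEN 1
           WHEN bool_or(oth.level = 'NATIVE' AND cur.level = 'LEARN') THEN 2
           ELSE 3
       END AS priority
FROM users_languages cur
JOIN users_languages oth
  ON oth.lang_code = cur.lang_code
 AND oth.user_id <> cur.user_id
GROUP BY cur.user_id, oth.user_id;

-- needed for REFRESH ... CONCURRENTLY
CREATE UNIQUE INDEX ON user_match_priority (current_user_id, other_user_id);

-- run on a schedule; CONCURRENTLY keeps the view readable during the refresh
REFRESH MATERIALIZED VIEW CONCURRENTLY user_match_priority;

The recommendation query then becomes a cheap lookup: join this view to users on other_user_id for a given current_user_id and order by priority.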
Try the following query (view):
If you only need the search for the current user, a simpler query is possible: one join and one grouping, which I think are well supported by indexes.
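The answer's code is not shown above; as a rough guess at its shape, reusing the question's :nativeCode and :langCodes parameters (so not necessarily the answerer's original):

SELECT u.*
FROM users u
JOIN (
    SELECT l.user_id,
           CASE
               WHEN bool_or(l.level = 'NATIVE' AND l.lang_code IN (:langCodes))
                AND bool_or(l.level = 'LEARN' AND l.lang_code = :nativeCode) THEN 1
               WHEN bool_or(l.level = 'NATIVE' AND l.lang_code IN (:langCodes)) THEN 2
               ELSE 3
           END AS priority
    FROM users_languages l
    -- only rows for languages the current user knows; an index on lang_code supports this
    WHERE l.lang_code IN (:langCodes) OR l.lang_code = :nativeCode
    GROUP BY l.user_id
) lp ON lp.user_id = u.id
ORDER BY lp.priority, u.id DESC;

Unlike the original query, users with no common language are filtered out here instead of being sorted last with a NULL priority.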
For test data:
Draft result is:
Query for all users:
Result:
fiddle here