sql >> Databaseteknologi >> RDS >> PostgreSQL

Smertefuldt langsom Postgres-forespørgsel ved hjælp af WHERE på mange tilstødende rækker

Trin 1: Brug en vinduesfunktion til at hente de tilstødende rækker og undgå den smertefulde self-join (12 tabeller er meget tæt på grænsen, hvor GEQO, den genetiske forespørgselsoptimering, tager over):

COPY (
    -- For every corpus row, look ahead at the following ten rows (in id
    -- order) so that a run of adjacent tokens can be filtered without
    -- joining the table to itself.
    WITH lookahead AS (
        SELECT
            c.id,
            c.source,
            c.word,
            -- word look-aheads
            LEAD(c.word, 1)  OVER corpus_order AS c2w,
            LEAD(c.word, 2)  OVER corpus_order AS c3w,
            LEAD(c.word, 3)  OVER corpus_order AS c4w,
            LEAD(c.word, 4)  OVER corpus_order AS c5w,
            LEAD(c.word, 5)  OVER corpus_order AS c6w,
            LEAD(c.word, 6)  OVER corpus_order AS c7w,
            LEAD(c.word, 7)  OVER corpus_order AS c8w,
            LEAD(c.word, 8)  OVER corpus_order AS c9w,
            LEAD(c.word, 9)  OVER corpus_order AS c10w,
            LEAD(c.word, 10) OVER corpus_order AS c11w,
            -- lemma look-aheads
            LEAD(c.lemma, 3) OVER corpus_order AS c4l,
            LEAD(c.lemma, 5) OVER corpus_order AS c6l,
            LEAD(c.lemma, 8) OVER corpus_order AS c9l,
            -- part-of-speech look-aheads
            LEAD(c.pos, 3)   OVER corpus_order AS c4p,
            LEAD(c.pos, 4)   OVER corpus_order AS c5p,
            LEAD(c.pos, 6)   OVER corpus_order AS c7p,
            LEAD(c.pos, 8)   OVER corpus_order AS c9p
        FROM orderedflatcorpus AS c
        WINDOW corpus_order AS (ORDER BY c.id)
    )
    SELECT
        id,
        source,
        word,
        c2w,
        c3w,
        c4w,
        c4l,
        c4p,
        c5w,
        c6w,
        c7w,
        c8w,
        c9w,
        c9l,
        c9p,
        c10w,
        c11w
    FROM lookahead
    -- c5p, c6l and c7p are only needed for filtering and are not exported.
    WHERE c4p LIKE 'v%'
      AND c5p = 'appge'
      AND c6l = 'way'
      AND c7p LIKE 'i%'
      AND c8w = 'the'
      AND c9p LIKE 'n%'
    ORDER BY id
)
-- Alternative target: '/home/postgres/Results/OUTPUT.csv'
TO '/tmp/OUTPUT2.csv' DELIMITER E'\t' CSV HEADER;
 

Trin 2: [datamodel] Kolonnerne {word, lemma, pos} ser ud til at være en gruppe med lav kardinalitet; du kan trække dem ud i en separat token/lemma/pos-tabel:

    -- Normalise the repeated (word, lemma, pos) triples of
    -- tmp.orderedflatcorpus into the lookup table tmp.words and replace
    -- the three columns by a single foreign key column, words_id.

    -- An index to speed up the unique extraction and the final update
    -- (the index will be dropped automatically once the columns are dropped).
    CREATE INDEX ON tmp.orderedflatcorpus (word, lemma, pos);

    ANALYZE tmp.orderedflatcorpus;

    -- Table containing the "squeezed out" domain.
    CREATE TABLE tmp.words AS
        SELECT DISTINCT word, lemma, pos
        FROM tmp.orderedflatcorpus;

    ALTER TABLE tmp.words
        ADD COLUMN id SERIAL NOT NULL PRIMARY KEY;

    ALTER TABLE tmp.words
        ADD UNIQUE (word, lemma, pos);

    -- The original table needs an FK "link" to the new table.
    -- The column has to start out nullable: existing rows are only
    -- backfilled by the UPDATE below. NOT NULL is enforced afterwards.
    ALTER TABLE tmp.orderedflatcorpus
        ADD COLUMN words_id INTEGER
        REFERENCES tmp.words (id);

    -- FK constraints are helped a lot by a supportive index.
    CREATE INDEX orderedflatcorpus_words_id_fk
        ON tmp.orderedflatcorpus (words_id);

    ANALYZE tmp.orderedflatcorpus;
    ANALYZE tmp.words;

    -- Initialize the FK column in the original table.
    -- We need IS NOT DISTINCT FROM here, since the joined columns could
    -- contain NULLs, which MUST compare equal.
    UPDATE tmp.orderedflatcorpus dst
       SET words_id = src.id
      FROM tmp.words src
     WHERE src.word IS NOT DISTINCT FROM dst.word
       AND dst.lemma IS NOT DISTINCT FROM src.lemma
       AND dst.pos IS NOT DISTINCT FROM src.pos;

    -- Drop the now redundant columns. SET NOT NULL is deliberately part of
    -- the same statement: if any row was left without a words_id, the whole
    -- ALTER TABLE fails and no column is dropped, instead of discarding that
    -- row's word/lemma/pos and having later inner joins skip the row.
    ALTER TABLE tmp.orderedflatcorpus
        ALTER COLUMN words_id SET NOT NULL,
        DROP COLUMN word,
        DROP COLUMN lemma,
        DROP COLUMN pos;
 

Og den nye forespørgsel med et JOIN til ord-tabellen:

COPY (
    -- Same look-ahead as the single-table query, but word / lemma / pos are
    -- read from the words table through orderedflatcorpus.words_id.
    WITH lookahead AS (
        SELECT
            c.id,
            c.source,
            tok.word,
            -- word look-aheads
            LEAD(tok.word, 1)  OVER corpus_order AS c2w,
            LEAD(tok.word, 2)  OVER corpus_order AS c3w,
            LEAD(tok.word, 3)  OVER corpus_order AS c4w,
            LEAD(tok.word, 4)  OVER corpus_order AS c5w,
            LEAD(tok.word, 5)  OVER corpus_order AS c6w,
            LEAD(tok.word, 6)  OVER corpus_order AS c7w,
            LEAD(tok.word, 7)  OVER corpus_order AS c8w,
            LEAD(tok.word, 8)  OVER corpus_order AS c9w,
            LEAD(tok.word, 9)  OVER corpus_order AS c10w,
            LEAD(tok.word, 10) OVER corpus_order AS c11w,
            -- lemma look-aheads
            LEAD(tok.lemma, 3) OVER corpus_order AS c4l,
            LEAD(tok.lemma, 5) OVER corpus_order AS c6l,
            LEAD(tok.lemma, 8) OVER corpus_order AS c9l,
            -- part-of-speech look-aheads
            LEAD(tok.pos, 3)   OVER corpus_order AS c4p,
            LEAD(tok.pos, 4)   OVER corpus_order AS c5p,
            LEAD(tok.pos, 6)   OVER corpus_order AS c7p,
            LEAD(tok.pos, 8)   OVER corpus_order AS c9p
        FROM orderedflatcorpus AS c
        INNER JOIN words AS tok
            ON tok.id = c.words_id
        WINDOW corpus_order AS (ORDER BY c.id)
    )
    SELECT
        id,
        source,
        word,
        c2w,
        c3w,
        c4w,
        c4l,
        c4p,
        c5w,
        c6w,
        c7w,
        c8w,
        c9w,
        c9l,
        c9p,
        c10w,
        c11w
    FROM lookahead
    -- c5p, c6l and c7p are only needed for filtering and are not exported.
    WHERE c4p LIKE 'v%'
      AND c5p = 'appge'
      AND c6l = 'way'
      AND c7p LIKE 'i%'
      AND c8w = 'the'
      AND c9p LIKE 'n%'
    ORDER BY id
)
-- Alternative target: '/home/postgres/Results/OUTPUT.csv'
TO '/tmp/OUTPUT3.csv' DELIMITER E'\t' CSV HEADER;
 

Bemærk: Jeg får to linjer i outputtet, fordi jeg lempede betingelserne lidt for meget ...

Opdatering: den første forespørgsel, nu uden CTE:

-- Variant of the single-table query that uses a subquery in FROM instead
-- of a CTE; columns, filter and ordering are otherwise the same.
-- NOTE(review): presumably meant to keep the CTE from acting as an
-- optimization fence on older PostgreSQL versions -- confirm.
COPY (
    SELECT id, source, word
        , c2w
        , c3w
        , c4w
        , c4l
        , c4p
        , c5w
        , c6w
        , c7w
        , c8w
        , c9w
        , c9l
        , c9p
        , c10w
        , c11w
    FROM (
        -- Look ahead at the following ten rows in id order.
        SELECT c1.id, c1.source, c1.word
            , LEAD(c1.word, 1) OVER (www) AS c2w
            , LEAD(c1.word, 2) OVER (www) AS c3w
            , LEAD(c1.word, 3) OVER (www) AS c4w
            , LEAD(c1.lemma, 3) OVER (www) AS c4l
            , LEAD(c1.pos, 3) OVER (www) AS c4p
            , LEAD(c1.pos, 4) OVER (www) AS c5p
            , LEAD(c1.word, 4) OVER (www) AS c5w
            , LEAD(c1.word, 5) OVER (www) AS c6w
            , LEAD(c1.lemma, 5) OVER (www) AS c6l
            , LEAD(c1.word, 6) OVER (www) AS c7w
            , LEAD(c1.pos, 6) OVER (www) AS c7p
            , LEAD(c1.word, 7) OVER (www) AS c8w
            , LEAD(c1.word, 8) OVER (www) AS c9w
            , LEAD(c1.lemma, 8) OVER (www) AS c9l
            , LEAD(c1.pos, 8) OVER (www) AS c9p
            , LEAD(c1.word, 9) OVER (www) AS c10w
            , LEAD(c1.word, 10) OVER (www) AS c11w
        FROM orderedflatcorpus AS c1
        WINDOW www AS (ORDER BY id)
    ) stuff
    WHERE 1=1
    AND c4p LIKE 'v%'
    AND c5p = 'appge'
    AND c6l = 'way'
    AND c7p LIKE 'i%'
    AND c8w = 'the'
    AND c9p LIKE 'n%'
    ORDER BY id
)
-- The line comment below must stay on its own line; otherwise it also
-- comments out the real TO clause that follows it.
-- TO '/home/postgres/Results/OUTPUT.csv' DELIMITER E'\t' csv header;
TO '/tmp/OUTPUT2a.csv' DELIMITER E'\t' csv header;

[en lignende transformation kunne udføres på den anden forespørgsel]

OPDATERING 2: Underforespørgselsversionen af varianten med to tabeller:

-- Subquery (no CTE) version of the two-table variant. The COPY wrapper and
-- EXPLAIN ANALYZE are left commented out so the statement runs as a plain
-- SELECT; uncomment the wrapper lines to export the result to a file.
-- copy(
-- EXPLAIN ANALYZE
SELECT c1i, c1s, c1w
    , c2w
    , c3w
    , c4w
    , c4l
    , c4p
    , c5w
    , c6w
    , c7w
    , c8w
    , c9w
    , c9l
    , c9p
    , c10w
    , c11w
FROM (
    SELECT c1.id AS c1i
        , c1.source AS c1s
        , w1.word AS c1w
        , LEAD (w1.word, 1) OVER www AS c2w
        , LEAD (w1.word, 2) OVER www AS c3w
        , LEAD (w1.word, 3) OVER www AS c4w
        , LEAD (w1.lemma, 3) OVER www AS c4l
        , LEAD (w1.pos, 3) OVER www AS c4p
        , LEAD (w1.pos, 4) OVER www AS c5p
        , LEAD (w1.word, 4) OVER www AS c5w
        , LEAD (w1.word, 5) OVER www AS c6w
        , LEAD (w1.lemma, 5) OVER www AS c6l
        , LEAD (w1.word, 6) OVER www AS c7w
        , LEAD (w1.pos, 6) OVER www AS c7p
        , LEAD (w1.word, 7) OVER www AS c8w
        , LEAD (w1.word, 8) OVER www AS c9w
        , LEAD (w1.lemma, 8) OVER www AS c9l
        , LEAD (w1.pos, 8) OVER www AS c9p
        , LEAD (w1.word, 9) OVER www AS c10w
        , LEAD (w1.word, 10) OVER www AS c11w
    FROM orderedflatcorpus c1
    JOIN words w1 ON w1.id=c1.words_id
    WHERE 1=1
    /* These *could* be used to prune out unmatched items,
       but I could not get it to work ...
    AND EXISTS (SELECT *FROM orderedflatcorpus c4 JOIN words w4 ON w4.id=c4.words_id
                WHERE c4.id = 3+c1.id -- AND w4.pos LIKE 'v%'
                ) -- OMG
    AND EXISTS (SELECT *FROM orderedflatcorpus c5 JOIN words w5 ON w5.id=c5.words_id
                WHERE c5.id = 4+c1.id -- AND w5.pos = 'appge'
                ) -- OMG
    AND EXISTS (SELECT *FROM orderedflatcorpus c7 JOIN words w7 ON w7.id=c7.words_id
                WHERE c7.id = 6+c1.id -- AND w7.pos LIKE 'i%'
                ) -- OMG
    AND EXISTS (SELECT *FROM orderedflatcorpus c9 JOIN words w9 ON w9.id=c9.words_id
                WHERE c9.id = 8+c1.id -- AND w9.pos LIKE 'n%'
                ) -- OMG
    AND EXISTS (SELECT *FROM orderedflatcorpus c8 JOIN words w8 ON w8.id=c8.words_id
                WHERE c8.id = 7+c1.id -- AND w8.word = 'the'
                ) -- OMG
    */
    -- The window is referenced as "OVER www" (no parentheses) because it
    -- carries a frame clause.
    WINDOW www AS (ORDER BY c1.id ROWS BETWEEN CURRENT ROW AND 10 FOLLOWING)
    ) stuff
WHERE 1=1
AND c4p LIKE 'v%'
AND c5p = 'appge'
AND c6l = 'way'
AND c7p LIKE 'i%'
AND c8w = 'the'
AND c9p LIKE 'n%'
ORDER BY c1i
;
-- )
-- TO '/home/postgres/Results/OUTPUT.csv' DELIMITER E'\t' csv header;
-- TO '/tmp/OUTPUT3b.csv' DELIMITER E'\t' csv header;

  1. Oracle Direct-load INSERTs gennem JDBC?

  2. Kopiér række og alle dens "børn"

  3. Brug af MySQL Data Reader

  4. Problemer med konfiguration af transaktionslog