I discovered a little bug just after I had sent the message: in the $id-string function, the path expression returns the attributes in document order, so the sorted (and reversed) order of the nodes is lost. You can work around this by using the simple map operator instead:

Old: $nodes/(@ID, @xml:id)
New: $nodes!(@ID, @xml:id)



On Tue, Nov 12, 2019 at 6:48 PM Christian Grün <christian.gruen@gmail.com> wrote:
Dear Omar,

Some spontaneous ideas:

• You could try to evaluate redundant expressions once and bind them to a variable instead (see the attached code).
• You could save each document to a separate database via db:create (depending on your data, this may be faster than replacements in a single database), or save all new elements in a single document.
• Instead of creating full index structures with each update operation, you may save a lot of time if you only update parts of the data that have actually changed.
• If that’s close to impossible (because the types of updates are too manifold), you could work with daily databases that only contain incremental changes, and merge them with the main database every night.

2.4 million tags are a lot, though, and the string length of the created attribute values seems to exceed 100,000 characters, which is a lot, too. What will you do with the resulting documents?

Best,
Christian



(: Namespace of the cache elements; needed for the "_:" prefix used below. :)
declare namespace _ = "https://www.oeaw.ac.at/acdh/tools/vle/util";

(:
 : Rebuilds the four sorted ID caches (ascending/descending, by @vutlsk and
 : by @vutlsk-archiv) in the cache database.
 :
 : The unsorted entries are read once and sorted once per sort key; the
 : descending variants are obtained by reversing the ascending result.
 :)

(: Returns the first 10000 @ID/@xml:id values of $nodes, in the order in
 : which $nodes was passed in, joined by single spaces. :)
let $id-string := function($nodes) {
  (: Use the simple map operator "!" here, not a path step "/": a path
   : expression returns its nodes in document order, which would silently
   : undo the sort()/reverse() applied by the caller. :)
  $nodes ! (@ID, @xml:id)
  => subsequence(1, 10000)
  => string-join(' ')
}

let $db := '_qdb-TEI-02__cache'
(: All unsorted entries; evaluated once and shared by both sorts. :)
let $nodes := db:open($db)/_:dryed[@order = 'none']/_:d

(: Ascending order by the respective attribute value (default collation). :)
let $vutlsk := sort($nodes, (), function($n) { $n/@vutlsk })
let $archiv := sort($nodes, (), function($n) { $n/@vutlsk-archiv })

return (
  db:replace($db, 'ascending_cache.xml',
    <_:dryed order="ascending" ids="{ $id-string($vutlsk) }"/>),
  db:replace($db, 'descending_cache.xml',
    <_:dryed order="descending" ids="{ $id-string(reverse($vutlsk)) }"/>),
  db:replace($db, 'ascending-archiv_cache.xml',
    <_:dryed order="ascending" ids="{ $id-string($archiv) }" label="archiv"/>),
  db:replace($db, 'descending-archiv_cache.xml',
    <_:dryed order="descending" ids="{ $id-string(reverse($archiv)) }" label="archiv"/>)
)
____________________________

On Tue, Nov 12, 2019 at 6:00 PM Omar Siam <Omar.Siam@oeaw.ac.at> wrote:
Hi,

I have a custom index that looks like this (one db, different files):

<_:dryed xmlns:_="https://www.oeaw.ac.at/acdh/tools/vle/util"
db_name="z881_qdb-TEI-02n" order="none">
   <_:d pre="15627" db_name="z881_qdb-TEI-02n" xml:id="z881_qdbn-d16e2"
vutlsk="tsįttr Ziter [Subst]" vutlsk-archiv="HK 881, z8810118.sch#1"/>
   <_:d pre="15673" db_name="z881_qdb-TEI-02n" xml:id="z881_qdbn-d16e21"
vutlsk="tsįttr Ziter [Subst]" vutlsk-archiv="HK 881, z8810118.sch#1"/>
...
</_:dryed>
<_:dryed xmlns:_="https://www.oeaw.ac.at/acdh/tools/vle/util"
db_name="f227_qdb-TEI-02n" order="none">
   <_:d pre="467" db_name="f227_qdb-TEI-02n" xml:id="f237_qdb-d1e29398"
vutlsk="(aus)faren [Verb]" vutlsk-archiv="HK 327, f227#944.1 =
fare0126.eck#1.1"/>
   <_:d pre="591" db_name="f227_qdb-TEI-02n" xml:id="f237_qdb-d1e29438"
vutlsk="(aus)faren [Verb]" vutlsk-archiv="HK 327, f227#945.1 =
fare0126.eck#2.1"/>
...
</_:dryed>

There are about 2.4 Mio _:d tags in this db.

I need to sort them by the @vutlsk* attributes alphabetically in
ascending and descending order.

With the code I have now:

declare namespace _ = "https://www.oeaw.ac.at/acdh/tools/vle/util";

(:
 : Writes four cache documents into the cache database. Each one holds the
 : first 10000 @ID/@xml:id values of the _:d entries found below elements
 : with order="none", sorted by @vutlsk or @vutlsk-archiv in ascending or
 : descending order and joined by single spaces.
 :
 : Every ordering is computed by its own FLWOR expression over the full
 : collection.
 :)
let $cache-db := '_qdb-TEI-02__cache'

(: Sorted by @vutlsk, A to Z. :)
let $ids-asc := subsequence(
  for $entry in collection($cache-db)//*[@order = "none"]/_:d
  order by $entry/@vutlsk ascending
  return $entry/(@ID, @xml:id)/data(),
  1, 10000)

(: Sorted by @vutlsk, Z to A. :)
let $ids-desc := subsequence(
  for $entry in collection($cache-db)//*[@order = "none"]/_:d
  order by $entry/@vutlsk descending
  return $entry/(@ID, @xml:id)/data(),
  1, 10000)

(: Sorted by @vutlsk-archiv, A to Z. :)
let $ids-archiv-asc := subsequence(
  for $entry in collection($cache-db)//*[@order = "none"]/_:d
  order by $entry/@vutlsk-archiv ascending
  return $entry/(@ID, @xml:id)/data(),
  1, 10000)

(: Sorted by @vutlsk-archiv, Z to A. :)
let $ids-archiv-desc := subsequence(
  for $entry in collection($cache-db)//*[@order = "none"]/_:d
  order by $entry/@vutlsk-archiv descending
  return $entry/(@ID, @xml:id)/data(),
  1, 10000)

return (
  db:replace($cache-db, 'ascending_cache.xml',
    <_:dryed order="ascending"
             ids="{ string-join($ids-asc, ' ') }"/>),
  db:replace($cache-db, 'descending_cache.xml',
    <_:dryed order="descending"
             ids="{ string-join($ids-desc, ' ') }"/>),
  db:replace($cache-db, 'ascending-archiv_cache.xml',
    <_:dryed order="ascending" label="archiv"
             ids="{ string-join($ids-archiv-asc, ' ') }"/>),
  db:replace($cache-db, 'descending-archiv_cache.xml',
    <_:dryed order="descending" label="archiv"
             ids="{ string-join($ids-archiv-desc, ' ') }"/>)
)

This takes 30 s to about a minute depending on the subsequence I choose.

I experimented with and without multithreading. Multiple jobs or
fork-join make it worse.

In the worst case, I need to do this every time I save a change to the
original DBs for which I maintain that index.

Any ideas how to speed this up?

Best regards

Omar Siam