#!/bin/bash mkdir tmp # TODO: work on the full-text part in PostgreSQL # a trigger will be required to make it work. # # DONE: more detailed content extraction. cat << EOF > tmp/archive-schema.sql CREATE DATABASE "archive"; \c "archive" CREATE EXTENSION IF NOT EXISTS btree_gist; DROP TABLE IF EXISTS thread2; CREATE TABLE IF NOT EXISTS thread2 ( id SERIAL PRIMARY KEY, date VARCHAR(300), author VARCHAR(300), post_id VARCHAR(300), doc_uri VARCHAR(300), content TEXT, basex_db VARCHAR(300) ); CREATE INDEX idx_thread2_date ON thread2(date); CREATE INDEX idx_thread2_author ON thread2(author); CREATE INDEX idx_thread2_post_id ON thread2(post_id); CREATE INDEX idx_thread2_doc_uri ON thread2(doc_uri); CREATE INDEX idx_thread2_basex_db ON thread2(basex_db); CREATE UNIQUE INDEX idx_thread2_doc_uri ON thread2(doc_uri); EOF LD_LIBRARY_PATH="" /share/Public/builds/prefix/bin/psql -U postgres -d postgres < tmp/archive-schema.sql cat << 'EOF' > tmp/import.xq let $conn-string:="jdbc:postgresql://localhost:5432/archive?user=postgres&password=postgres" let $pgconn := sql:connect($conn-string) let $dbs:=fn:filter(db:list(), function($x){ matches($x,"linuxquestions-shard-") }) for $db in fn:reverse(fn:sort($dbs)) for $x in db:open($db) let $doc_uri:=$x/fn:base-uri() let $elems:=$x//table[matches(@id,"post\d+")] for $elem in $elems let $date:=$elem/tr[1]/td[1]/text() let $author:=$elem/tr[2]/td[1]/div[1]/a//text() let $post_id:=$elem/@id/string() let $content:=$elem//*[matches(@id,"post_message_")]/text() let $params := { $date } { $author } { $post_id } { $doc_uri } { $content } { $db } let $prep:=sql:prepare($pgconn, "INSERT INTO thread2(date,author,post_id,doc_uri,content,basex_db) VALUES(?,?,?,?,?,?)") return try { sql:execute-prepared($prep,$params) } catch * { 'Error [' || $err:code || ']: ' || $err:description || '--' || $params } EOF /share/Public/builds/basex/bin/basexclient -U admin -P admin tmp/import.xq