On 31.03.2020 18:32, Ben Engbers wrote:
Hi,
For (my personal) clarity, I have split up the original function in two parts:
(:~
 : Counts, for every distinct token, in how many of the given nodes it occurs.
 :
 : Each node's text is split into whitespace-separated tokens and de-duplicated
 : per node, so a word is counted at most once per node. The combined token
 : list is then handed to tidyTM:wordCount_arr.
 :
 : @param $nodes nodes whose text() content is tokenized
 : @return whatever tidyTM:wordCount_arr returns for the combined token list
 :)
declare function local:step_one($nodes as node()*) as array(*)* {
  let $text :=
    for $node in $nodes
    return distinct-values(tokenize($node/text()))
  return tidyTM:wordCount_arr($text)
};
In local:step_one(), I first create a sequence with the distinct tokens for each $node. All the sequences are joined in $text. I then call wordCount_arr to count the occurrences of each word in $text:
(: Groups $Words by value and returns one [word, count] array per distinct word,
   ordered by count, highest first.
   NOTE(review): the FLWOR returns one array per group, i.e. a sequence of arrays,
   while the declared return type array(*) stands for exactly one array;
   array(*)* would match what the body produces. :)
declare function tidyTM:wordCount_arr( $Words as xs:string*) as array(*) { for $w in $Words let $f := $w group by $f order by count($w) descending return ([$f, count($w)]) } ;
I would say that tidyTM:wordCount_arr returns a sequence of arrays but I am not certain if I have specified the correct return-type?
Reading the code, I agree that the function returns a sequence of arrays, so I wonder why you don't get a similar error here as you do later on, where the return type is also declared as array(*) and not array(*)*.
Calling local:step_one(tidyTM:remove_Stopwords($nodes, "Stp", $Stoppers)) returns: ["probleem", 703] ["opgelost.", 248] ....
I had hoped that calling the following function, local:wordFreq_idf, would add the idf to each element, but instead I get an error:
(: Appends a third member, math:log(count($nodes) div <second member>), to every
   array returned by local:step_one.
   NOTE: for-each produces one array per item of $idf, but the declared return
   type array(*) allows exactly one array; that mismatch is what the XPTY0004
   message quoted after the declaration reports. :)
declare function local:wordFreq_idf($nodes as node()*) as array(*) { let $count := count($nodes) let $idf := local:step_one($nodes) let $result := for-each( $idf, function($z) {array:append ($z, math:log($count div $z(2) ) ) } ) return $result }; [XPTY0004] Cannot promote (array(xs:anyAtomicType))+ to array(*): $idf := ([ "probleem", 703 ], [ "opgelost.", 248 ], ...).
The message tries to tell you that the declared return type array(*) is a single array, while the function returns a (non-empty) sequence of arrays, so declaring the function as `declare function local:wordFreq_idf($nodes as node()*) as array(*)*` would remove that error.
To insert the third value into each array I think you want
let $result := $idf ! array:append(., math:log($count div .(2) ))
Hi,
To insert the third value into each array I think you want
let $result := $idf ! array:append(., math:log($count div .(2) ))
This works!
Martin and Graydon, thanks for the help and the explanation.
Ben
import module namespace tidyTM = 'http://www.be-logical.nl';
(:~
 : Counts, for every distinct token, in how many of the given nodes it occurs.
 :
 : Tokens are de-duplicated per node before counting, so a word that appears
 : several times in one node contributes only once for that node.
 :
 : @param $nodes nodes whose text() content is tokenized on whitespace
 : @return the result of tidyTM:wordCount_arr for the combined token list
 :)
declare function local:step_one($nodes as node()*) as array(*)* {
  tidyTM:wordCount_arr(
    for $node in $nodes
    return distinct-values(tokenize($node/text()))
  )
};
(:~
 : Extends each [word, count] array from local:step_one with an
 : inverse-document-frequency value.
 :
 : The appended value is math:log(N div count), where N is count($nodes) and
 : count is the second member of the array; math:log is the natural logarithm.
 :
 : The return type is array(*)* because one array is returned per distinct
 : word. Declaring a single array(*) raises XPTY0004 as soon as the result
 : holds anything other than exactly one array.
 :
 : @param $nodes nodes to analyse; their number is used as N
 : @return one [word, count, idf] array per array returned by local:step_one
 :)
declare function local:wordFreq_idf($nodes as node()*) as array(*)* {
  let $count := count($nodes)
  for $entry in local:step_one($nodes)
  return array:append($entry, math:log($count div $entry(2)))
};
(: Main query: read the INC_RM elements and the stop-word lines, pass both to
   tidyTM:remove_Stopwords, and compute word frequencies plus idf for the result. :)
let $remarks   := collection('IncidentRemarks/Incidenten-180101-190630.csv')/csv/record/INC_RM
let $stopwords := doc('TextMining/Stopwoorden.txt')/text/line/text()
let $cleaned   := tidyTM:remove_Stopwords($remarks, "Stp", $stopwords)
return local:wordFreq_idf($cleaned)
--------------
(:~
 : Counts how often each distinct string occurs in the input.
 :
 : @param $Words strings to count; may be empty
 : @return one [word, count] array per distinct word, ordered by count,
 :         highest first; the empty sequence if $Words is empty
 :)
declare function tidyTM:wordCount_arr($Words as xs:string*) as array(*)* {
  for $occurrence in $Words
  group by $word := $occurrence
  (: after grouping, $occurrence holds every input string equal to $word :)
  let $frequency := count($occurrence)
  order by $frequency descending
  return array { $word, $frequency }
} ;
-----------
["probleem", 703, 9.362885817944681e-1] ["opgelost.", 248, 1.9782167274401508e0] ...
basex-talk@mailman.uni-konstanz.de