[MarkLogic Dev General] Data profiling on large datasets

Gary Vidal Gary.Vidal at marklogic.com
Mon Mar 30 06:38:41 PDT 2015


Alex,

I lifted this code from a project I wrote that does analysis and captures statistics. The goal is to discover root elements by recursive descent: each pass appends a query that negates the root nodes already visited, and the recursion continues until all nodes are resolved, with an iteration cut-off so that it stops before the tree cache fills up.


(: Prefix used for the local function a:get-root-elements declared below. :)
declare namespace a = "a:roots";
(: Call counter; a:get-root-elements bumps it with xdmp:set on every invocation. :)
declare variable $bcount := 0;
(: Cut-off: once $bcount exceeds this value the recursion stops discovering
   roots and moves on to reporting what it has found so far. :)
declare variable $MAX-ITERATIONS := 200;
(: Query passed to the root-discovery search and to the per-root frequency
   estimates. The empty and-query is the "no restriction" default. :)
declare variable $base-constraint := cts:and-query(());

(:~
 : Recursively discovers the QNames of root elements in the database and,
 : once discovery is finished, reports those that occur at least once.
 :
 : Each discovery pass searches for one root element in a document that does
 : not contain any already-visited QName, records it, and recurses. Discovery
 : ends when no new root is found or when the module-level $bcount counter
 : exceeds $MAX-ITERATIONS.
 :
 : @param $bdone   true when discovery is complete and frequencies should be computed
 : @param $qnames  QNames of the root elements visited so far
 : @param $results "{namespace-uri}local-name" keys for the visited root elements
 : @return for each key whose estimated frequency is greater than zero, the
 :         value of xdmp:key-from-QName for that root element
 :)
declare function a:get-root-elements($bdone,$qnames,$results) {
   (: Count this call so the recursion can be cut off after $MAX-ITERATIONS. :)
   xdmp:set($bcount,$bcount + 1),
   if($bdone or $bcount > $MAX-ITERATIONS) then (
     xdmp:log(fn:concat("Starting Root Frequency:",xdmp:elapsed-time()),"debug"),
     for $k in $results
     (: Split the "{ns}local" key back into its namespace and local-name parts. :)
     let $parts := fn:analyze-string($k,"\{(.*)\}(.*)")
     let $ns := fn:string($parts/*:match/*:group[@nr eq 1])
     let $local-name := fn:string($parts/*:match/*:group[@nr eq 2])
     (: The root path has to be built as a string and evaluated because the
        element name is only known at run time. Both variants use the same
        search options so that the two estimates are computed identically. :)
     let $frequency  :=
        if($ns eq "")
        then xdmp:eval(
            fn:concat("declare variable $base-constraint external;xdmp:estimate(cts:search(/",$local-name,",($base-constraint),('unfiltered')))"),
            (fn:QName("","base-constraint"),$base-constraint)
            )
        else xdmp:eval(
            fn:concat("declare namespace _1  = """,$ns,""";
                       declare variable $base-constraint external;
                       xdmp:estimate(cts:search(/_1:",$local-name,",($base-constraint),('unfiltered')))"),
            (fn:QName("","base-constraint"),$base-constraint)
            )
     where $frequency > 0
     return ((:
      <root-element>
        <type>element</type>
        <database>{xdmp:database()}</database>
        <id>{xdmp:md5($k)}</id>
        <namespace>{$ns}</namespace>
        <localname>{$local-name}</localname>
        <frequency>{$frequency}</frequency>
      </root-element>
      :)
       xdmp:key-from-QName(fn:QName($ns,$local-name))
      ),
     xdmp:log(fn:concat("Finished Root Frequency:",xdmp:elapsed-time()),"debug")
)
else
    (: One not-query per visited QName; the empty sequence when nothing has
       been visited yet.
       NOTE(review): cts:element-query is not limited to root elements, so this
       presumably also excludes documents that contain a visited QName below
       the root - confirm that is acceptable for the data being profiled. :)
    let $constraint :=
      for $qn in $qnames
      return cts:not-query(cts:element-query($qn,cts:and-query(())))
    (: Always honour $base-constraint, including on the very first pass, so
       that discovery and the frequency estimates look at the same documents. :)
    let $rnode :=
        fn:subsequence(
            cts:search(/element(),cts:and-query(($base-constraint,$constraint)),"unfiltered"),
            1,1)
    return
        if($rnode instance of element() and fn:not(fn:node-name($rnode) = $qnames))
        then
            (: New root element: remember its QName and key, then keep looking. :)
            let $qname := fn:node-name($rnode)
            let $key := fn:concat("{",fn:namespace-uri($rnode),"}",fn:local-name($rnode))
            return (
                a:get-root-elements(fn:false(),($qnames,$qname),($key,$results))
            )
        else if(fn:node-name($rnode) = $qnames) then
            (: The unfiltered search returned a root that was already visited:
               nothing new can be found, so finish. :)
            a:get-root-elements(fn:true(),$qnames,$results)
        else if(fn:empty($rnode)) then
            (: No remaining documents match: discovery is complete. :)
            a:get-root-elements(fn:true(),$qnames,$results)
        else
           (: Defensive fallback; retries until the iteration cut-off applies. :)
           a:get-root-elements(fn:false(),$qnames,$results)
};
(: Entry point: start discovery with no visited QNames and no collected results. :)
a:get-root-elements(fn:false(),(),())


More information about the General mailing list