[MarkLogic Dev General] Data profiling on large datasets

Geert Josten Geert.Josten at marklogic.com
Mon Mar 30 08:03:16 PDT 2015


And here is some more code that could be useful to you. Not the prettiest, and
wrapped in a REST extension, but maybe useful nonetheless.

https://gist.github.com/grtjn/1aba4eb364de9268fb5f


Cheers,
Geert

On 3/30/15, 3:38 PM, "Gary Vidal" <Gary.Vidal at marklogic.com> wrote:

>Alex,
>
>I hoisted this code from a project I wrote that does analysis and
>captures statistics.   The goal is to do recursive descent until all
>nodes are resolved by appending a query to negate visited root nodes,
>with a cut-off before tree-cache fills up.
>
>
>(: Namespace prefix used for the local function declared in this script. :)
>declare namespace a = "a:roots";
>(: Call counter; a:get-root-elements increments it via xdmp:set on each call. :)
>declare variable $bcount := 0;
>(: Cut-off: once $bcount exceeds this, discovery stops and reporting begins. :)
>declare variable $MAX-ITERATIONS := 200;
>(: Query combined into the discovery search and passed to the evaluated
>   estimates; defined here as an and-query over the empty sequence. :)
>declare variable $base-constraint := cts:and-query(());
>
>(:
> : Recursively discovers root element names, then reports those with a
> : non-zero estimated count.
> :
> : Discovery phase (while $bdone is false and the call cap is not exceeded):
> :   takes the first result of an unfiltered search for /element(), excluding
> :   the names already in $qnames through cts:not-query, and recurses with the
> :   new name added. Recurses with $bdone = true once the search returns
> :   nothing or returns a name that is already known.
> : Reporting phase (when $bdone is true or $bcount exceeds $MAX-ITERATIONS):
> :   for each key in $results, runs xdmp:estimate over a search rooted at that
> :   element and returns xdmp:key-from-QName for every key whose estimate is
> :   greater than zero. Start and finish are logged at "debug" level.
> :
> : $bdone   - true to stop discovering and switch to reporting
> : $qnames  - QNames of the root elements found so far
> : $results - keys of the form "{namespace-uri}local-name" found so far
> :
> : Side effect: increments the global $bcount through xdmp:set on every call.
> :
> : NOTE(review): this listing was hard-wrapped by the mail client. Lines that
> : begin right at the quote marker with no indentation continue the previous
> : line, and some of the splits fall inside string literals. Rejoin them
> : before running.
> :)
>declare function a:get-root-elements($bdone,$qnames,$results) {
>   (: Count this call; the counter acts as the recursion cut-off. :)
>   xdmp:set($bcount,$bcount + 1),
>   if($bdone or $bcount > $MAX-ITERATIONS) then (
>     xdmp:log(fn:concat("Starting Root
>Frequency:",xdmp:elapsed-time()),"debug"),
>     (: Split each key back into its namespace URI and local name. :)
>     for $k in $results
>     let $parts := fn:analyze-string($k,"\{(.*)\}(.*)")
>     let $ns := fn:string($parts/*:match/*:group[@nr eq 1])
>     let $local-name := fn:string($parts/*:match/*:group[@nr eq 2])
>     (: The root step is spliced into the query text, so the search is run
>        through xdmp:eval with $base-constraint passed as an external
>        variable. :)
>     let $frequency  :=
>        if($ns eq "")
>        then xdmp:eval(
>            fn:concat("declare variable $base-constraint
>external;xdmp:estimate(cts:search(/",$local-name,",($base-constraint),('un
>filtered')))"),
>            (fn:QName("","base-constraint"),$base-constraint)
>            ) 
>        (: NOTE(review): unlike the no-namespace branch above, this search
>           does not pass the 'unfiltered' option; confirm that is intended. :)
>        else xdmp:eval(
>            fn:concat("declare namespace _1  = """,$ns,""";
>                       declare variable $base-constraint external;
>                  
>xdmp:estimate(cts:search(/_1:",$local-name,",$base-constraint))"),
>            (fn:QName("","base-constraint"),$base-constraint)
>            )
>     where $frequency > 0
>     return ((:
>      <root-element>
>        <type>element</type>
>        <database>{xdmp:database()}</database>
>        <id>{xdmp:md5($k)}</id>
>        <namespace>{$ns}</namespace>
>        <localname>{$local-name}</localname>
>        <frequency>{$frequency}</frequency>
>      </root-element>
>      :)
>       xdmp:key-from-QName(fn:QName($ns,$local-name))
>      ),
>     xdmp:log(fn:concat("Finished Root
>Frequency:",xdmp:elapsed-time()),"debug")
>)
>else 
>    (: One cts:not-query per known root name, so the next search skips
>       documents matched by those names. :)
>    let $constraint :=
>      if(fn:exists($qnames))
>      then for $qn in $qnames return
>cts:not-query(cts:element-query($qn,cts:and-query(())))
>      else ()
>    (: First result only. NOTE(review): when $qnames is empty the search
>       below uses a bare and-query and ignores $base-constraint; confirm
>       that is intended. :)
>    let $rnode := 
>        if(fn:not(fn:empty($qnames)))
>        then 
>fn:subsequence(cts:search(/element(),cts:and-query(($base-constraint,$cons
>traint)),"unfiltered"),1,1)
>        else 
>fn:subsequence(cts:search(/element(),cts:and-query(()),"unfiltered"),1,1)
>    return
>        (: A new root name: record its QName and key, then keep discovering. :)
>        if($rnode instance of element() and fn:not(fn:node-name($rnode) =
>$qnames)) 
>        then
>            let $qname := fn:node-name($rnode)
>            let $key :=
>fn:concat("{",fn:namespace-uri($rnode),"}",fn:local-name($rnode))
>            return (
>                
>a:get-root-elements(fn:false(),($qnames,$qname),($key,$results))
>            )
>        (: A known name came back, or nothing was found: switch to
>           reporting. :)
>        else if(fn:node-name($rnode) = $qnames) then
>            a:get-root-elements(fn:true(),$qnames,$results)
>        else if(fn:empty($rnode)) then
>            a:get-root-elements(fn:true(),$qnames,$results)
>        (: NOTE(review): this fallback recurses with unchanged arguments, so
>           only the $MAX-ITERATIONS cap ends it; confirm whether the branch
>           is reachable. :)
>        else 
>           a:get-root-elements(fn:false(),$qnames,$results)
>};
>(: Entry point: start discovery with no known root QNames and no results. :)
>a:get-root-elements(fn:false(),(),())
>_______________________________________________
>General mailing list
>General at developer.marklogic.com
>http://developer.marklogic.com/mailman/listinfo/general



More information about the General mailing list