[MarkLogic Dev General] Data profiling on large datasets

Joe Bryan Joe.Bryan at marklogic.com
Mon Mar 30 09:10:43 PDT 2015


Hi Alex,

Here's a library of experimental query types:
https://github.com/joemfb/cts-extensions. It includes a function called
ctx:root-QNames(), which lists root elements as xs:QName values. (The
implementation is conceptually similar to Gary's.) That function is
parameterized, so you can restrict this list to root elements from
fragments matching arbitrary queries.

Thanks.

-jb

On 3/30/15, 11:03 AM, "Geert Josten" <Geert.Josten at marklogic.com> wrote:

>And here is some more code that could be useful to you. Not the prettiest, and
>wrapped in a REST extension, but maybe useful nonetheless.
>
>https://gist.github.com/grtjn/1aba4eb364de9268fb5f
>
>
>Cheers,
>Geert
>
>On 3/30/15, 3:38 PM, "Gary Vidal" <Gary.Vidal at marklogic.com> wrote:
>
>>Alex,
>>
>>I hoisted this code from a project I wrote that does analysis and
>>captures statistics.   The goal is to do recursive descent until all
>>nodes are resolved by appending a query to negate visited root nodes,
>>with a cut-off before tree-cache fills up.
>>
>>
>>(: Namespace for the local function declared below. :)
>>declare namespace a = "a:roots";
>>(: Call counter; incremented at the top of every a:get-root-elements call. :)
>>declare variable $bcount := 0;
>>(: Once $bcount exceeds this value the function takes its reporting branch. :)
>>declare variable $MAX-ITERATIONS := 200;
>>(: Query passed to the estimates and to the follow-up searches; an empty and-query here. :)
>>declare variable $base-constraint := cts:and-query(());
>>
>>(:
>>  Collects element names one search at a time, then reports them.
>>
>>  $bdone   - when true, stop searching and run the reporting branch
>>  $qnames  - node names found so far; each one is wrapped in a cts:not-query
>>             over a cts:element-query for the next search
>>  $results - "{namespace-uri}local-name" keys, one per name found so far
>>
>>  Reporting branch (also taken once $bcount exceeds $MAX-ITERATIONS): for each
>>  key, evaluates an xdmp:estimate over a search for that element at the document
>>  root and, when the estimate is greater than 0, returns xdmp:key-from-QName of
>>  the name. Start and finish times are written to the log at "debug" level.
>>:)
>>declare function a:get-root-elements($bdone,$qnames,$results) {
>>   xdmp:set($bcount,$bcount + 1),
>>   if($bdone or $bcount > $MAX-ITERATIONS) then (
>>     (: Reporting branch: log, emit one key per counted name, log again. :)
>>     xdmp:log(fn:concat("Starting Root
>>Frequency:",xdmp:elapsed-time()),"debug"),
>>     for $k in $results
>>     (: Split the "{ns}local" key back into its namespace and local name. :)
>>     let $parts := fn:analyze-string($k,"\{(.*)\}(.*)")
>>     let $ns := fn:string($parts/*:match/*:group[@nr eq 1])
>>     let $local-name := fn:string($parts/*:match/*:group[@nr eq 2])
>>     (: The estimate query is built as a string and run through xdmp:eval, with
>>        $base-constraint passed in as an external variable.
>>        NOTE(review): the line breaks inside the string literals below look like
>>        mail wrapping - confirm before running.
>>        NOTE(review): only the no-namespace branch passes an options argument
>>        to cts:search; the namespaced branch passes none - confirm intent. :)
>>     let $frequency  :=
>>        if($ns eq "")
>>        then xdmp:eval(
>>            fn:concat("declare variable $base-constraint
>>external;xdmp:estimate(cts:search(/",$local-name,",($base-constraint),('u
>>n
>>filtered')))"),
>>            (fn:QName("","base-constraint"),$base-constraint)
>>            ) 
>>        else xdmp:eval(
>>            fn:concat("declare namespace _1  = """,$ns,""";
>>                       declare variable $base-constraint external;
>>                 
>>xdmp:estimate(cts:search(/_1:",$local-name,",$base-constraint))"),
>>            (fn:QName("","base-constraint"),$base-constraint)
>>            )
>>     (: Names whose estimate is 0 are dropped. :)
>>     where $frequency > 0
>>     return ((:
>>      <root-element>
>>        <type>element</type>
>>        <database>{xdmp:database()}</database>
>>        <id>{xdmp:md5($k)}</id>
>>        <namespace>{$ns}</namespace>
>>        <localname>{$local-name}</localname>
>>        <frequency>{$frequency}</frequency>
>>      </root-element>
>>      :)
>>       xdmp:key-from-QName(fn:QName($ns,$local-name))
>>      ),
>>     xdmp:log(fn:concat("Finished Root
>>Frequency:",xdmp:elapsed-time()),"debug")
>>)
>>else 
>>    (: One cts:not-query per name already seen, each wrapping a
>>       cts:element-query on that name. :)
>>    let $constraint :=
>>      if(fn:exists($qnames))
>>      then for $qn in $qnames return
>>cts:not-query(cts:element-query($qn,cts:and-query(())))
>>      else ()
>>    (: Take the first unfiltered search result for /element().
>>       NOTE(review): when $qnames is empty the search uses cts:and-query(())
>>       rather than $base-constraint - confirm this is intended. :)
>>    let $rnode :=
>>        if(fn:not(fn:empty($qnames)))
>>        then 
>>fn:subsequence(cts:search(/element(),cts:and-query(($base-constraint,$con
>>s
>>traint)),"unfiltered"),1,1)
>>        else 
>>fn:subsequence(cts:search(/element(),cts:and-query(()),"unfiltered"),1,1)
>>    return
>>        (: A new name is recorded and the search repeats. A repeated name or an
>>           empty result recurses with $bdone true, which triggers reporting. :)
>>        if($rnode instance of element() and fn:not(fn:node-name($rnode) =
>>$qnames)) 
>>        then
>>            let $qname := fn:node-name($rnode)
>>            let $key :=
>>fn:concat("{",fn:namespace-uri($rnode),"}",fn:local-name($rnode))
>>            return (
>>                
>>a:get-root-elements(fn:false(),($qnames,$qname),($key,$results))
>>            )
>>        else if(fn:node-name($rnode) = $qnames) then
>>            a:get-root-elements(fn:true(),$qnames,$results)
>>        else if(fn:empty($rnode)) then
>>            a:get-root-elements(fn:true(),$qnames,$results)
>>        else 
>>           a:get-root-elements(fn:false(),$qnames,$results)
>>};
>>(: Entry point: start with no names found and no result keys. :)
>>a:get-root-elements(fn:false(),(),())
>>_______________________________________________
>>General mailing list
>>General at developer.marklogic.com
>>http://developer.marklogic.com/mailman/listinfo/general
>
>_______________________________________________
>General mailing list
>General at developer.marklogic.com
>http://developer.marklogic.com/mailman/listinfo/general



More information about the General mailing list