XML : FLWOR XQuery to calculate the probability between words

I am making a file containing a FLWOR XQuery to return all the occurrences of the target word 'we' in the xml file, together with the word which comes next in the sentence in each case. I want to calculate the probability as the ratio: (number of times successor word appears after target word 'we') / (number of times successor word appears overall).

Here is the XML File I am working on:

  <u who="PS6H7">  <s n="3">      <w c5="AV0" hw="well" pos="ADV">Well</w>      <c c5="PUN">, </c>      <w c5="AJ0" hw="good" pos="ADJ">good </w>      <w c5="NN1" hw="afternoon" pos="SUBST">afternoon</w>      <c c5="PUN">, </c>      <w c5="PNI" hw="everybody" pos="PRON">everybody</w>      <c c5="PUN">, </c>      <w c5="PNP" hw="i" pos="PRON">I </w>      <w c5="VVB" hw="think" pos="VERB">think </w>      <w c5="PNP" hw="we" pos="PRON">we</w>      <w c5="VHD" hw="have" pos="VERB">'d </w>      <w c5="AV0" hw="well" pos="ADV">better </w>      <w c5="VVI" hw="get" pos="VERB">get </w>      <w c5="VVN" hw="start" pos="VERB">started</w>      <c c5="PUN">.</c>  </s>    <s n="4">      <w c5="PNP" hw="we" pos="PRON">We </w>      <w c5="VVD" hw="look" pos="VERB">looked </w>      <w c5="AV0" hw="so" pos="ADV">so </w>      <w c5="AJ0" hw="thin" pos="ADJ">thin </w>      <w c5="PRP" hw="on" pos="PREP">on </w>      <w c5="AT0" hw="the" pos="ART">the </w>      <w c5="NN1" hw="ground" pos="SUBST">ground</w>      <c c5="PUN">, </c>      <w c5="PNP" hw="i" pos="PRON">I </w>      <w c5="VVD" hw="think" pos="VERB">thought </w>      <w c5="PNP" hw="we" pos="PRON">we</w>      <w c5="VM0" hw="would" pos="VERB">'d </w>      <w c5="VVI" hw="sit" pos="VERB">sit </w>      <w c5="CJC" hw="and" pos="CONJ">and </w>      <w c5="VVI" hw="wait" pos="VERB">wait </w>      <w c5="CJC" hw="and" pos="CONJ">and </w>      <w c5="VVI" hw="see" pos="VERB">see </w>      <w c5="CJS" hw="if" pos="CONJ">if </w>      <w c5="PNI" hw="everyone" pos="PRON">everyone</w>      <w c5="VBZ" hw="be" pos="VERB">'s </w>      <w c5="VVG-AJ0" hw="come" pos="VERB">coming</w>      <c c5="PUN">, </c>      <w c5="CJC" hw="but" pos="CONJ">but </w>      <w c5="UNC" hw="erm" pos="UNC">erm </w>      <w c5="PNP" hw="we" pos="PRON">we</w>      <w c5="VM0" hw="will" pos="VERB">'ll </w>      <w c5="VHI" hw="have" pos="VERB">have </w>      <w c5="TO0" hw="to" pos="PREP">to </w>      <w c5="VVI" hw="get" pos="VERB">get </w>      <w c5="VVN" hw="start" pos="VERB">started </w>      <w c5="AV0" hw="anyway" pos="ADV">anyway</w>      <c c5="PUN">.</c>  </s>    <s n="5">      <w c5="PNP" hw="we" pos="PRON">We</w>      <w c5="VM0" hw="will" pos="VERB">'ll </w>      <w c5="VVI" hw="welcome" pos="VERB">welcome</w>      <c c5="PUN">, </c>      <w c5="PNP" hw="we" pos="PRON">we </w>      <w c5="VHB" hw="have" pos="VERB">have </w>      <w c5="CRD" hw="two" pos="ADJ">two </w>      <w c5="NN2" hw="speaker" pos="SUBST">speakers</w>      <c c5="PUN">, </c>      <w c5="NP0" hw="mr" pos="SUBST">Mr </w>      <w c5="NP0" hw="bob" pos="SUBST">Bob </w>      <w c5="NP0" hw="plumtree" pos="SUBST">Plumtree</w>      <c c5="PUN">, </c>      <w c5="CJC" hw="and" pos="CONJ">and </w>      <w c5="NP0" hw="ms" pos="SUBST">Ms </w>      <w c5="NP0" hw="erica" pos="SUBST">Erica </w>      <w c5="NP0" hw="ison" pos="SUBST">Ison</w>      <c c5="PUN">.</c>  </s>    <s n="6">      <w c5="PNP" hw="we" pos="PRON">We </w>      <w c5="VVD" hw="ask" pos="VERB">asked </w>      <w c5="PNP" hw="they" pos="PRON">them </w>      <w c5="PRP" hw="to" pos="PREP">to </w>      <w c5="AT0" hw="the" pos="ART">the </w>      <w c5="NN1" hw="meeting" pos="SUBST">meeting </w>      <w c5="CJC" hw="and" pos="CONJ">and </w>      <w c5="PNP" hw="we" pos="PRON">we </w>      <w c5="VVB" hw="look" pos="VERB">look </w>      <w c5="AV0" hw="forward" pos="ADV">forward </w>      <w c5="PRP" hw="to" pos="PREP">to </w>      <w c5="VVG-NN1" hw="listen" pos="VERB">listening </w>      <w c5="PRP" hw="to" pos="PREP">to </w>      <w c5="PNP" hw="you" pos="PRON">you </w>      <w c5="AV0" hw="later" pos="ADV">later </w>      <w c5="AVP" hw="on" pos="ADV">on </w>      <w c5="PRP" hw="in" pos="PREP">in </w>      <w c5="AT0" hw="the" pos="ART">the </w>      <w c5="NN1" hw="agenda" pos="SUBST">agenda</w>      <c c5="PUN">.</c>  </s>    <s n="7">      <w c5="AT0" hw="the" pos="ART">The </w>      <w c5="NN2" hw="minute" pos="SUBST">minutes </w>      <w c5="PRF" hw="of" pos="PREP">of </w>      <w c5="AT0" hw="the" pos="ART">the </w>      <w c5="NN1" hw="meeting" pos="SUBST">meeting </w>      <w c5="VVD-VVN" hw="hold" pos="VERB">held </w>      <w c5="PRP" hw="in" pos="PREP">in </w>      <w c5="NP0" hw="january" pos="SUBST">January</w>      <c c5="PUN">.</c>  </s>    <s n="8">      <w c5="DT0" hw="any" pos="ADJ">Any </w>      <w c5="NN2" hw="correction" pos="SUBST">corrections </w>      <w c5="PRP" hw="to" pos="PREP">to </w>      <w c5="AT0" hw="the" pos="ART">the </w>      <w c5="NN2" hw="minute" pos="SUBST">minutes </w>      <w c5="ORD" hw="first" pos="ADJ">first</w>      <c c5="PUN">?</c>  </s>    </u>     

This is my FLWOR XQuery file. It returns all the occurrences of the target word 'we, together with the word that comes after it. I am also able to find the frequency (number of times the successor word occurs after target word), but I cannot calculate the probability ratio. The formula to find probability is (number of times successor word appears after target word 'we') / (number of times successor word appears overall).

In result, I want to an HTML table to show the target word 'we' in 1st column, the word that occurs after 'we' in 2nd column and the frequency or number of times the combination occurred in 3rd column, and the probability in the 4th column.

  <html>  <body>  <table border='1'>  <tr><td>Target</td><td>Successor</td><td>Frequency</td><td>Probability</td></tr>    {    let $s := "has"    let $x := doc("KS0.xml")//u//s//w[lower-case(normalize-space()) = $s]    for $w in distinct-values($x/following-sibling::w[1])    let $e := count(doc("KS0.xml")//u//s//w)    let $g := $x/following-sibling::w[1][. = $w]    order by count($g) descending    return <tr>             <td>{$s}</td>             <td>{$w}</td>             <td>{count($g)}</td>             <td>{$e}</td>         </tr>  }    </table>  </body>  </html>    

No comments:

Post a Comment