|
From: Lars S. <sch...@sa...> - 2024-09-02 15:21:12
|
Hey, we assume that we are not using the index in our project as intended, because when we try to build the index we have created, it takes a very long time. We have two collections, one with a data stack of 687 in which data is stored, and one with 400 XML files where articles are stored. For the personal article we want certain information from the articles and vice versa. Person XML: <person xml:id="i0c9ab7e2-2e21-39ff-aea8-c56ad4702a7f" status="safe" modified="2024-07-30T13:26:09.154+02:00"> <name>Marcanton Zimara</name> <identifier preferred="YES">https://d-nb.info/gnd/120156784</identifier> <alternateName>Marcusantonius Zimara</alternateName> <alternateName>Marcus Anthonius Zimara</alternateName> <alternateName>Antonius Zimara</alternateName> <alternateName>M. Antonius Zimarra</alternateName> <alternateName>Marc Antoine Zimara</alternateName> <alternateName>M. Anto. Zimare</alternateName> <alternateName>Marco A. Zimara</alternateName> <alternateName>Marcus A. Zimara</alternateName> <alternateName>Marc Ant. 
Zimara</alternateName> <alternateName>Marcantonio Zimara</alternateName> <alternateName>Marcus Antonius Zimara</alternateName> <alternateName>Marcus Antonius Zimarra</alternateName> <alternateName>Marcianto Zimare</alternateName> <alternateName>Marco Antonio Zimarra</alternateName> <alternateName>Marco Antonio Zimare</alternateName> <birthDate>1460</birthDate> <deathDate>1532</deathDate> <description>JWO</description> <sortableName>Zimara, Marcanton </sortableName> </person> Article XML: <?xml version="1.0" encoding="UTF-8"?> <TEI xmlns="http://www.tei-c.org/ns/1.0"> <teiHeader> <fileDesc> <titleStmt> <title>a nihilo nihil fit</title> <author> <persName ref="/db/projects/jwo/data/lists/personenListe.xml#BS_d1e509" xml:id="author_BS_d1e509"> <forename>Marcanton</forename> <surname>Zimara</surname> </persName> </author> </titleStmt> <sourceDesc> <p xml:id="p_sourceDesc_igw_tvr_pzb">born digital</p> </sourceDesc> </fileDesc> </teiHeader> <text xml:lang="de-DE" type="main"> <body> <div1 xml:id="div1_d1e23_2"> <p xml:id="p_d1e27_1" n="1"> Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation <persName xml:id="persName_sa123" ref="https://d-nb.info/gnd/120156784" rend="smallcaps">Zimara</persName> ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu <persName xml:id="persName_s123" ref="https://d-nb.info/gnd/120156784">Zimara</persName> fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 
</p> </div1> </body> </text> </TEI> Collection.xconf: <collection xmlns="http://exist-db.org/collection-config/1.0"> <index xmlns:gndo="https://d-nb.info/standards/elementset/gnd#" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:xs="http://www.w3.org/2001/XMLSchema"> <lucene> <module uri="http://place.sok.org/xquery/index-persons" prefix="ip" at="xmldb:exist:///db/apps/sok-application/modules/index-persons.xqm"/> <analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/> <analyzer class="org.exist.indexing.lucene.analyzers.NoDiacriticsStandardAnalyzer" id="nodiacritics"/> <text qname="adcache"> <field name="basicId" expression="//person/@id/string()"/> <field name="fullname" expression="string(./basic/person/name)"/> <field name="gndURI" expression="string(./basic/person/identifier[@preferred eq 'YES'])"/> <field name="gndID" expression="substring-after(./basic/person/identifier[@preferred eq 'YES']/string(), '/gnd/')"/> <field name="status" expression="./basic/person/@status/string()"/> <field name="articleID" expression="ip:getArticleFromPersonCache(.)"/> <field name="articleRole" expression="ip:getArticleRoleFromPersonCache(.)"/> <field name="fulltext" expression="ip:getFullText(.)"/> </text> </lucene> </index> </collection> Module Functions: module namespace ip = "http://place.sok.org/xquery/index-persons"; declare namespace basic = "http://place.sok.org/xquery/basic" ; declare namespace xs = "http://www.w3.org/2001/XMLSchema"; declare namespace tei = "http://www.tei-c.org/ns/1.0"; declare namespace util = "http://exist-db.org/xquery/util"; declare function ip:getArticleFromPersonCache($adcache as element()) as xs:string* { let $parentCollectionPath as xs:anyURI? := ip:getParentCollection($adcache), $basicId as xs:string := $adcache/basic/person/@id/string(), $identifier as xs:string? := $adcache/basic/person/identifier[@preferred eq 'YES']/string(), $listId as xs:string? 
:= collection( $variables:jwo-lists-path )/tei:TEI//tei:person[ basic:basic-id-from-url(string(@sameAs)) eq $basicId]/@xml:id/string(), $foundInDocumentIds as xs:string* := if ( matches($parentCollectionPath,'prepublish') ) then ( collection($parentCollectionPath)/tei:TEI[./tei:teiHeader//tei:idno[1]/string() ne ''][matches(replace((normalize-space('||'||string-join(distinct-values(.//tei:persName[@ref]/@ref/string()) ! replace(.,'.*?#',''), '||')||'||')||normalize-space('||'||string-join(distinct-values(.//tei:persName[@source]/@source/string()) ! replace(.,'.*?#',''), '||')||'||')),'\|{4}',''),'\|{2}('||$basicId||'|'||$listId||'|'||$identifier||')\|{2}')]//tei:idno/string() ) else ( collection($parentCollectionPath)/tei:TEI[./tei:teiHeader//tei:idno[1]/string() ne ''][matches(replace((normalize-space('||'||string-join(distinct-values(.//tei:persName[@ref][not(parent::editor)]/@ref/string()) ! replace(.,'.*?#',''), '||')||'||')||normalize-space('||'||string-join(distinct-values(.//tei:persName[@source][not(parent::editor)]/@source/string()) ! 
replace(.,'.*?#',''), '||')||'||')),'\|{4}',''),'\|{2}('||$basicId||'|'||$listId||'|'||$identifier||')\|{2}')]//tei:idno/string() ) return ( $foundInDocumentIds ) }; declare function ip:getAuthenticatedArticleCollection($collection-name as xs:string) as item()* { if ($collection-name eq 'prepublish') then xmldb:xcollection($variables:jwo-prepublish-path) else xmldb:xcollection($variables:jwo-publish-path) }; declare function ip:getPersNamesInCollectionFromCachedPerson($cached-person as element(), $collection-name as xs:string) as element()* { let $basicId := $cached-person/basic/person/@id/string() let $identifier := $cached-person/basic/person/identifier[@preferred eq 'YES']/string() let $listId := collection( $variables:jwo-lists-path )/tei:TEI/tei:text[1]/tei:body[1]/tei:listPerson[1]/tei:person[ basic:basic-id-from-url(string(@sameAs)) eq $basicId]/@xml:id/string() let $collection := ip:getAuthenticatedArticleCollection($collection-name) return ( $collection//tei:persName[ string(@ref) eq $identifier or ip:getIdFromUri(string(@ref)) eq $listId or substring-after(string(@ref), '#') eq $basicId or substring-before(substring-after(string(@source), 'persons/'), '.xml') eq $basicId] ) }; declare function ip:getRoleFromPersName($persName as element(), $collection-name as xs:string) as xs:string? 
{ if ($persName/ancestor::*/local-name() = 'author') then ( 'author' ) else if ($persName/ancestor::*/local-name() = 'editor') then ( if ($collection-name eq 'prepublish') then ( 'editor' ) (: Ignore editors in published case :) else () ) else ( 'annotated' ) }; declare function ip:getArticleRoleFromPersonCache($cached-person as element(), $collection-name as xs:string) as xs:string* { let $allPersNames := if ($collection-name ne 'prepublish') then ( ip:getPersNamesInCollectionFromCachedPerson($cached-person, $collection-name)[not(ancestor::*/local-name() = 'editor')] ) else ( ip:getPersNamesInCollectionFromCachedPerson($cached-person, $collection-name) ) return ( for $articleGroup in $allPersNames let $articleID := $articleGroup/ancestor::tei:TEI//tei:idno[1] group by $articleID return ( $articleID || '@@' || string-join(distinct-values( for $persName in $articleGroup let $role := ip:getRoleFromPersName($persName, $collection-name) order by $role return $role ), ' ') ) ) }; declare function ip:getParentCollection($element as node()) as xs:anyURI? { resolve-uri('../../', $element/base-uri()) }; declare function ip:getIdFromUri($uri as xs:string) as xs:string { substring-after($uri, '#') }; declare function basic:basic-id-from-url($url as xs:string) as xs:string? { substring-after(substring-before($url, '?dataset'),'persons/') }; declare function ip:getFullText($element) as xs:string { let $parentCollection as xs:anyURI? 
:= ip:getParentCollection($element) return ( normalize-space(string-join( let $basicId as xs:string := $element/basic/person/@id/string(), $identifier as xs:string* := $element/basic/person/identifier[@preferred eq 'YES']/string(), $listId as item()* := collection( $variables:lists-path )/tei:TEI/tei:text[1]/tei:body[1]/tei:listPerson[1]/tei:person[ basic:basic-id-from-url(string(@sameAs)) eq $basicId]/@xml:id/string(), $element-string as xs:string* := string($element), $collections as item()* := collection($parentCollection)//tei:persName[string(@ref) eq $identifier or ip:getIdFromUri(string(@ref)) eq $listId or substring-after(string(@ref), '#') eq $basicId or substring-before(substring-after(string(@source), 'persons/'), '.xml') eq $basicId][1], $element-cache-string as xs:string* := string-join(for $found-element in $collections where count($found-element) > 0 return $found-element, ' ') return ( $element-string,$element-cache-string ),' ')) ) }; Please help. -- Lars Scheideler - wiss. technischer Mitarbeiter - Althochdeutsches Wörterbuch & Digital Humanities Sächsische Akademie der Wissenschaften zu Leipzig Karl-Tauchnitz-Straße 1 04107 Leipzig sch...@sa... www.saw-leipzig.de |