Index: documentation/helpcontent2/makefile.pmk diff -u documentation/helpcontent2/makefile.pmk:1.21 documentation/helpcontent2/makefile.pmk:1.21.6.1 --- documentation/helpcontent2/makefile.pmk:1.21 Thu Apr 26 06:55:36 2007 +++ documentation/helpcontent2/makefile.pmk Sun May 20 07:00:14 2007 @@ -36,9 +36,7 @@ SHELL_PACKAGE:=$(subst,/,$/ $(PACKAGE)) HLANGXHPFILES:=$(foreach,i,$(XHPFILES) $(foreach,j,$(aux_alllangiso) $(COMMONMISC)$/$j$/$(SHELL_PACKAGE)$/$(i:f))) -.IF "$(SOLAR_JAVA)"!="" ALLTAR : $(COMMONMISC)$/$(TARGET).done $(COMMONMISC)$/xhp_changed.flag optix -.ENDIF $(HLANGXHPFILES) : $$(@:d)thisdir.created Index: documentation/helpcontent2/helpers/linkmakefile.template diff -u documentation/helpcontent2/helpers/linkmakefile.template:1.8 documentation/helpcontent2/helpers/linkmakefile.template:1.8.162.1 --- documentation/helpcontent2/helpers/linkmakefile.template:1.8 Wed Dec 14 07:33:48 2005 +++ documentation/helpcontent2/helpers/linkmakefile.template Sun May 20 06:59:34 2007 @@ -26,7 +26,6 @@ .INCLUDE : settings.mk .INCLUDE : $(PRJ)$/settings.pmk -.IF "$(SOLAR_JAVA)"!="" common_build_zip:= zip1generatedlangs=TRUE zip1langdirs=$(aux_alllangiso) @@ -34,8 +33,6 @@ ZIP1FLAGS= -u -r ZIP1DIR=$(COMMONMISC)$/$(LANGDIR) ZIP1LIST=$(LANGDIR)$/text$/%module%$/* -x "*.xhp????*" -x "*.dphh*" -x "*.hzip" -x "*.created" -.ENDIF # "$(SOLAR_JAVA)"!="" - LINKNAME=%module% LINKADDEDFILES= \ Index: documentation/helpcontent2/util/target.pmk diff -u documentation/helpcontent2/util/target.pmk:1.16 documentation/helpcontent2/util/target.pmk:1.16.20.2 --- documentation/helpcontent2/util/target.pmk:1.16 Tue Feb 6 06:12:39 2007 +++ documentation/helpcontent2/util/target.pmk Thu May 24 01:50:14 2007 @@ -36,27 +36,7 @@ LINKALLTARGETS=$(foreach,i,$(aux_alllangiso) $(BIN)$/$(LINKNAME)_$i.zip) LINKALLADDEDDEPS=$(foreach,i,$(aux_alllangiso) $(subst,LANGUAGE,$i $(LINKADDEDDPES))) -.IF "$(SOLAR_JAVA)"!="" ALLTAR : $(LINKALLTARGETS) -.ENDIF - -.IF "$(SYSTEM_DB)" != "YES" -JAVA_LIBRARY_PATH= 
-Djava.library.path=$(SOLARSHAREDBIN) -.ENDIF - -.IF "$(JDK)" != "gcj" -JAVA_VM_HEAP_SIZE = -Xms256m -Xmx256m -.ENDIF - -.IF "$(JAVAAOTCOMPILER)" != "" -CLASSPATH!:=$(my_cp) -.EXPORT : CLASSPATH -.ENDIF $(LINKALLTARGETS) : $(foreach,i,$(LINKLINKFILES) $(COMMONMISC)$/$$(@:b:s/_/./:e:s/.//)/$i) $(subst,LANGUAGE,$$(@:b:s/_/./:e:s/.//) $(LINKADDEDDEPS)) $(COMMONMISC)$/xhp_changed.flag -.IF "$(JAVAAOTCOMPILER)" != "" - com.sun.star.help.HelpLinker @$(mktmp -mod $(LINKNAME) -hid $(PRJ)$/helpers/hid.lst -src $(COMMONMISC) -sty $(PRJ)$/source$/auxiliary$/embed.xsl -idx $(PRJ)$/source$/auxiliary$/index.xsl -lang {$(subst,$(LINKNAME)_, $(@:b))} $(subst,LANGUAGE,{$(subst,$(LINKNAME)_, $(@:b))} $(LINKADDEDFILES)) $(foreach,i,$(LINKLINKFILES) $(COMMONMISC)$/{$(subst,$(LINKNAME)_, $(@:b))}/$i) -o $@) -.ELSE - $(JAVAI) $(JAVA_VM_HEAP_SIZE) $(JAVA_LIBRARY_PATH) -Djavax.xml.parsers.SAXParserFactory=org.apache.xerces.jaxp.SAXParserFactoryImpl -Djavax.xml.parsers.DocumentBuilderFactory=org.apache.xerces.jaxp.DocumentBuilderFactoryImpl -cp $(my_cp) com.sun.star.help.HelpLinker @$(mktmp -mod $(LINKNAME) -hid $(PRJ)$/helpers/hid.lst -src $(COMMONMISC) -sty $(PRJ)$/source$/auxiliary$/embed.xsl -idx $(PRJ)$/source$/auxiliary$/index.xsl -lang {$(subst,$(LINKNAME)_, $(@:b))} $(subst,LANGUAGE,{$(subst,$(LINKNAME)_, $(@:b))} $(LINKADDEDFILES)) $(foreach,i,$(LINKLINKFILES) $(COMMONMISC)$/{$(subst,$(LINKNAME)_, $(@:b))}/$i) -o $@) -.ENDIF - + $(WRAPCMD) HelpLinker @$(mktmp -mod $(LINKNAME) -hid $(PRJ)$/helpers/hid.lst -src $(COMMONMISC) -sty $(PRJ)$/source$/auxiliary$/embed.xsl -idx $(PRJ)$/source$/auxiliary$/index.xsl -lang {$(subst,$(LINKNAME)_, $(@:b))} $(subst,LANGUAGE,{$(subst,$(LINKNAME)_, $(@:b))} $(LINKADDEDFILES)) $(foreach,i,$(LINKLINKFILES) $(COMMONMISC)$/{$(subst,$(LINKNAME)_, $(@:b))}/$i) -o $@) Index: documentation/helpcontent2/util/sbasic/makefile.mk diff -u documentation/helpcontent2/util/sbasic/makefile.mk:1.27 documentation/helpcontent2/util/sbasic/makefile.mk:1.27.10.1 --- 
documentation/helpcontent2/util/sbasic/makefile.mk:1.27 Mon Apr 2 08:59:55 2007 +++ documentation/helpcontent2/util/sbasic/makefile.mk Sun May 20 07:01:31 2007 @@ -26,7 +26,6 @@ .INCLUDE : settings.mk .INCLUDE : $(PRJ)$/settings.pmk -.IF "$(SOLAR_JAVA)"!="" common_build_zip:= zip1generatedlangs=TRUE zip1langdirs=$(aux_alllangiso) @@ -34,8 +33,6 @@ ZIP1FLAGS= -u -r ZIP1DIR=$(COMMONMISC)$/$(LANGDIR) ZIP1LIST=$(LANGDIR)$/text$/sbasic$/* -x "*.xhp????*" -x "*.dphh*" -x "*.hzip" -x "*.created" -.ENDIF # "$(SOLAR_JAVA)"!="" - LINKNAME=sbasic LINKADDEDFILES= \ Index: documentation/helpcontent2/util/scalc/makefile.mk diff -u documentation/helpcontent2/util/scalc/makefile.mk:1.31 documentation/helpcontent2/util/scalc/makefile.mk:1.31.10.1 --- documentation/helpcontent2/util/scalc/makefile.mk:1.31 Mon Apr 2 09:00:11 2007 +++ documentation/helpcontent2/util/scalc/makefile.mk Sun May 20 07:01:31 2007 @@ -26,7 +26,6 @@ .INCLUDE : settings.mk .INCLUDE : $(PRJ)$/settings.pmk -.IF "$(SOLAR_JAVA)"!="" common_build_zip:= zip1generatedlangs=TRUE zip1langdirs=$(aux_alllangiso) @@ -34,8 +33,6 @@ ZIP1FLAGS= -u -r ZIP1DIR=$(COMMONMISC)$/$(LANGDIR) ZIP1LIST=$(LANGDIR)$/text$/scalc$/* -x "*.xhp????*" -x "*.dphh*" -x "*.hzip" -x "*.created" -.ENDIF # "$(SOLAR_JAVA)"!="" - LINKNAME=scalc LINKADDEDFILES= \ Index: documentation/helpcontent2/util/schart/makefile.mk diff -u documentation/helpcontent2/util/schart/makefile.mk:1.16 documentation/helpcontent2/util/schart/makefile.mk:1.16.156.1 --- documentation/helpcontent2/util/schart/makefile.mk:1.16 Wed Dec 14 07:34:45 2005 +++ documentation/helpcontent2/util/schart/makefile.mk Sun May 20 07:01:31 2007 @@ -26,7 +26,6 @@ .INCLUDE : settings.mk .INCLUDE : $(PRJ)$/settings.pmk -.IF "$(SOLAR_JAVA)"!="" common_build_zip:= zip1generatedlangs=TRUE zip1langdirs=$(aux_alllangiso) @@ -34,8 +33,6 @@ ZIP1FLAGS= -u -r ZIP1DIR=$(COMMONMISC)$/$(LANGDIR) ZIP1LIST=$(LANGDIR)$/text$/schart$/* -x "*.xhp????*" -x "*.dphh*" -x "*.hzip" -x "*.created" -.ENDIF # 
"$(SOLAR_JAVA)"!="" - LINKNAME=schart LINKADDEDFILES= \ Index: documentation/helpcontent2/util/sdatabase/makefile.mk diff -u documentation/helpcontent2/util/sdatabase/makefile.mk:1.16 documentation/helpcontent2/util/sdatabase/makefile.mk:1.16.10.1 --- documentation/helpcontent2/util/sdatabase/makefile.mk:1.16 Mon Apr 2 09:00:51 2007 +++ documentation/helpcontent2/util/sdatabase/makefile.mk Sun May 20 07:01:32 2007 @@ -26,7 +26,6 @@ .INCLUDE : settings.mk .INCLUDE : $(PRJ)$/settings.pmk -.IF "$(SOLAR_JAVA)"!="" common_build_zip:= zip1generatedlangs=TRUE zip1langdirs=$(aux_alllangiso) @@ -34,8 +33,6 @@ ZIP1FLAGS= -u -r ZIP1DIR=$(COMMONMISC)$/$(LANGDIR) ZIP1LIST=$(LANGDIR)$/text$/shared$/explorer$/database$/main.xhp -.ENDIF # "$(SOLAR_JAVA)"!="" - LINKNAME=sdatabase LINKADDEDFILES= \ Index: documentation/helpcontent2/util/sdraw/makefile.mk diff -u documentation/helpcontent2/util/sdraw/makefile.mk:1.28 documentation/helpcontent2/util/sdraw/makefile.mk:1.28.10.1 --- documentation/helpcontent2/util/sdraw/makefile.mk:1.28 Mon Apr 2 09:01:04 2007 +++ documentation/helpcontent2/util/sdraw/makefile.mk Sun May 20 07:01:32 2007 @@ -26,7 +26,6 @@ .INCLUDE : settings.mk .INCLUDE : $(PRJ)$/settings.pmk -.IF "$(SOLAR_JAVA)"!="" common_build_zip:= zip1generatedlangs=TRUE zip1langdirs=$(aux_alllangiso) @@ -34,8 +33,6 @@ ZIP1FLAGS= -u -r ZIP1DIR=$(COMMONMISC)$/$(LANGDIR) ZIP1LIST=$(LANGDIR)$/text$/sdraw$/* -x "*.xhp????*" -x "*.dphh*" -x "*.hzip" -x "*.created" -.ENDIF # "$(SOLAR_JAVA)"!="" - LINKNAME=sdraw LINKADDEDFILES= \ Index: documentation/helpcontent2/util/shared/makefile.mk diff -u documentation/helpcontent2/util/shared/makefile.mk:1.15 documentation/helpcontent2/util/shared/makefile.mk:1.15.156.1 --- documentation/helpcontent2/util/shared/makefile.mk:1.15 Wed Dec 14 07:35:24 2005 +++ documentation/helpcontent2/util/shared/makefile.mk Sun May 20 07:01:32 2007 @@ -26,7 +26,6 @@ .INCLUDE : settings.mk .INCLUDE : $(PRJ)$/settings.pmk -.IF "$(SOLAR_JAVA)"!="" common_build_zip:= 
zip1generatedlangs=TRUE zip1langdirs=$(aux_alllangiso) @@ -34,8 +33,6 @@ ZIP1FLAGS= -u -r ZIP1DIR=$(COMMONMISC)$/$(LANGDIR) ZIP1LIST=$(LANGDIR)$/text$/shared$/* -x "*.xhp????*" -x "*.dphh*" -x "*.hzip" -x "*.created" -.ENDIF # "$(SOLAR_JAVA)"!="" - LINKNAME=shared LINKADDEDFILES= \ Index: documentation/helpcontent2/util/simpress/makefile.mk diff -u documentation/helpcontent2/util/simpress/makefile.mk:1.27 documentation/helpcontent2/util/simpress/makefile.mk:1.27.10.1 --- documentation/helpcontent2/util/simpress/makefile.mk:1.27 Mon Apr 2 09:01:28 2007 +++ documentation/helpcontent2/util/simpress/makefile.mk Sun May 20 07:01:32 2007 @@ -26,7 +26,6 @@ .INCLUDE : settings.mk .INCLUDE : $(PRJ)$/settings.pmk -.IF "$(SOLAR_JAVA)"!="" common_build_zip:= zip1generatedlangs=TRUE zip1langdirs=$(aux_alllangiso) @@ -34,8 +33,6 @@ ZIP1FLAGS= -u -r ZIP1DIR=$(COMMONMISC)$/$(LANGDIR) ZIP1LIST=$(LANGDIR)$/text$/simpress$/* -x "*.xhp????*" -x "*.dphh*" -x "*.hzip" -x "*.created" -.ENDIF # "$(SOLAR_JAVA)"!="" - LINKNAME=simpress LINKADDEDFILES= \ Index: documentation/helpcontent2/util/smath/makefile.mk diff -u documentation/helpcontent2/util/smath/makefile.mk:1.26 documentation/helpcontent2/util/smath/makefile.mk:1.26.10.1 --- documentation/helpcontent2/util/smath/makefile.mk:1.26 Mon Apr 2 09:01:56 2007 +++ documentation/helpcontent2/util/smath/makefile.mk Sun May 20 07:01:32 2007 @@ -26,7 +26,6 @@ .INCLUDE : settings.mk .INCLUDE : $(PRJ)$/settings.pmk -.IF "$(SOLAR_JAVA)"!="" common_build_zip:= zip1generatedlangs=TRUE zip1langdirs=$(aux_alllangiso) @@ -34,8 +33,6 @@ ZIP1FLAGS= -u -r ZIP1DIR=$(COMMONMISC)$/$(LANGDIR) ZIP1LIST=$(LANGDIR)$/text$/smath$/* -x "*.xhp????*" -x "*.dphh*" -x "*.hzip" -x "*.created" -.ENDIF # "$(SOLAR_JAVA)"!="" - LINKNAME=smath LINKADDEDFILES= \ Index: documentation/helpcontent2/util/swriter/makefile.mk diff -u documentation/helpcontent2/util/swriter/makefile.mk:1.30 documentation/helpcontent2/util/swriter/makefile.mk:1.30.10.1 --- 
documentation/helpcontent2/util/swriter/makefile.mk:1.30 Mon Apr 2 09:02:09 2007 +++ documentation/helpcontent2/util/swriter/makefile.mk Sun May 20 07:01:33 2007 @@ -26,7 +26,6 @@ .INCLUDE : settings.mk .INCLUDE : $(PRJ)$/settings.pmk -.IF "$(SOLAR_JAVA)"!="" common_build_zip:= zip1generatedlangs=TRUE zip1langdirs=$(aux_alllangiso) @@ -34,8 +33,6 @@ ZIP1FLAGS= -u -r ZIP1DIR=$(COMMONMISC)$/$(LANGDIR) ZIP1LIST=$(LANGDIR)$/text$/swriter$/* -x "*.xhp????*" -x "*.dphh*" -x "*.hzip" -x "*.created" -.ENDIF # "$(SOLAR_JAVA)"!="" - LINKNAME=swriter LINKADDEDFILES= \ Index: external/boost/boost-1.30.2.patch diff -u external/boost/boost-1.30.2.patch:1.7 external/boost/boost-1.30.2.patch:1.7.2.1 --- external/boost/boost-1.30.2.patch:1.7 Mon Mar 26 04:22:12 2007 +++ external/boost/boost-1.30.2.patch Sun May 20 05:20:10 2007 @@ -668,3 +668,45 @@ + #endif // BOOST_MPL_SEQUENCE_TAG_HPP_INCLUDED +*** misc/build/boost-1.30.2/boost/token_functions.hpp 2007-05-20 07:38:11.000000000 -0400 +--- misc/modified/boost-1.30.2/boost/token_functions.hpp 2007-05-20 08:05:05.000000000 -0400 +*************** +*** 64,70 **** + // character (backslash \), can be assigned to other characters. + + struct escaped_list_error : public std::runtime_error{ +! escaped_list_error(const std::string& what):std::runtime_error(what) { } + }; + + +--- 64,70 ---- + // character (backslash \), can be assigned to other characters. + + struct escaped_list_error : public std::runtime_error{ +! escaped_list_error(const std::string& s_what) : std::runtime_error(s_what) { } + }; + + +*** misc/build/boost-1.30.2/boost/token_iterator.hpp 2007-05-20 07:38:10.000000000 -0400 +--- misc/modified/boost-1.30.2/boost/token_iterator.hpp 2007-05-20 08:10:20.000000000 -0400 +*************** +*** 64,72 **** + + template + typename Iterator::reference +! dereference(const Iterator& a) const{ + using namespace std; +! assert(a.base().valid_); + return tok_; + } + template +--- 64,72 ---- + + template + typename Iterator::reference +! 
dereference(const Iterator& /*a*/) const{ + using namespace std; +! //assert(a.base().valid_); + return tok_; + } + template Index: external/boost/spirit-1.6.1.patch diff -u external/boost/spirit-1.6.1.patch:1.7 external/boost/spirit-1.6.1.patch:1.7.14.3 --- external/boost/spirit-1.6.1.patch:1.7 Thu Oct 12 08:54:57 2006 +++ external/boost/spirit-1.6.1.patch Wed May 23 12:08:04 2007 @@ -1564,3 +1564,150 @@ + #endif + #endif // BOOST_UTILITY_ADDRESSOF_HPP +*** misc/spirit-1.6.1/miniboost/boost/concept_check.hpp 2007-05-20 07:38:14.000000000 -0400 +--- misc/build/spirit-1.6.1/miniboost/boost/concept_check.hpp 2007-05-20 07:52:59.000000000 -0400 +*************** +*** 708,719 **** + function_requires< AssignableConcept >(); + const_constraints(c); + } +! void const_constraints(const Container& c) { +! i = c.begin(); +! i = c.end(); +! n = c.size(); +! n = c.max_size(); +! b = c.empty(); + } + Container c; + bool b; +--- 708,719 ---- + function_requires< AssignableConcept >(); + const_constraints(c); + } +! void const_constraints(const Container& cnr) { +! i = cnr.begin(); +! i = cnr.end(); +! n = cnr.size(); +! n = cnr.max_size(); +! b = cnr.empty(); + } + Container c; + bool b; +*************** +*** 777,785 **** + BidirectionalIteratorConcept >(); + const_constraints(c); + } +! void const_constraints(const ReversibleContainer& c) { +! const_reverse_iterator i = c.rbegin(); +! i = c.rend(); + } + ReversibleContainer c; + }; +--- 777,785 ---- + BidirectionalIteratorConcept >(); + const_constraints(c); + } +! void const_constraints(const ReversibleContainer& cnr) { +! const_reverse_iterator i = cnr.rbegin(); +! i = cnr.rend(); + } + ReversibleContainer c; + }; +*************** +*** 821,828 **** + + const_constraints(c); + } +! void const_constraints(const RandomAccessContainer& c) { +! const_reference r = c[n]; + ignore_unused_variable_warning(r); + } + RandomAccessContainer c; +--- 821,828 ---- + + const_constraints(c); + } +! 
void const_constraints(const RandomAccessContainer& cnr) { +! const_reference r = cnr[n]; + ignore_unused_variable_warning(r); + } + RandomAccessContainer c; +*************** +*** 925,932 **** + reference r = c.back(); + ignore_unused_variable_warning(r); + } +! void const_constraints(const BackInsertionSequence& c) { +! const_reference r = c.back(); + ignore_unused_variable_warning(r); + }; + BackInsertionSequence c; +--- 925,932 ---- + reference r = c.back(); + ignore_unused_variable_warning(r); + } +! void const_constraints(const BackInsertionSequence& cnr) { +! const_reference r = cnr.back(); + ignore_unused_variable_warning(r); + }; + BackInsertionSequence c; +*************** +*** 947,956 **** + c.erase(r.first, r.second); + const_constraints(c); + } +! void const_constraints(const AssociativeContainer& c) { +! ci = c.find(k); +! n = c.count(k); +! cr = c.equal_range(k); + } + typedef typename AssociativeContainer::iterator iterator; + typedef typename AssociativeContainer::const_iterator const_iterator; +--- 947,956 ---- + c.erase(r.first, r.second); + const_constraints(c); + } +! void const_constraints(const AssociativeContainer& cnr) { +! ci = cnr.find(k); +! n = cnr.count(k); +! cr = cnr.equal_range(k); + } + typedef typename AssociativeContainer::iterator iterator; + typedef typename AssociativeContainer::const_iterator const_iterator; +*** misc/spirit-1.6.1/miniboost/boost/type_traits/type_with_alignment.hpp Wed May 23 19:45:34 2007 +--- misc/build/spirit-1.6.1/miniboost/boost/type_traits/type_with_alignment.hpp Wed May 23 19:47:00 2007 +*************** +*** 72,78 **** + #undef BOOST_TT_CHOOSE_MIN_ALIGNMENT + #undef BOOST_TT_CHOOSE_T + +! template + struct is_aligned + { + BOOST_STATIC_CONSTANT(bool, +--- 72,78 ---- + #undef BOOST_TT_CHOOSE_MIN_ALIGNMENT + #undef BOOST_TT_CHOOSE_T + +! 
template + struct is_aligned + { + BOOST_STATIC_CONSTANT(bool, +*************** +*** 93,99 **** + + // This alignment method originally due to Brian Parker, implemented by David + // Abrahams, and then ported here by Doug Gregor. +! template + class type_with_alignment + { + typedef detail::lower_alignment t1; +--- 93,99 ---- + + // This alignment method originally due to Brian Parker, implemented by David + // Abrahams, and then ported here by Doug Gregor. +! template + class type_with_alignment + { + typedef detail::lower_alignment t1; Index: external/boost/prj/d.lst diff -u external/boost/prj/d.lst:1.11 external/boost/prj/d.lst:1.11.10.1 --- external/boost/prj/d.lst:1.11 Wed Dec 13 07:03:08 2006 +++ external/boost/prj/d.lst Sat May 19 13:23:52 2007 @@ -2,6 +2,9 @@ ..\%__SRC%\misc\build\boost-1.30.2\boost\rational.hpp %_DEST%\inc%_EXT%\boost\rational.hpp ..\%__SRC%\misc\build\boost-1.30.2\boost\operators.hpp %_DEST%\inc%_EXT%\boost\operators.hpp +..\%__SRC%\misc\build\boost-1.30.2\boost\tokenizer.hpp %_DEST%\inc%_EXT%\boost\tokenizer.hpp +..\%__SRC%\misc\build\boost-1.30.2\boost\token_iterator.hpp %_DEST%\inc%_EXT%\boost\token_iterator.hpp +..\%__SRC%\misc\build\boost-1.30.2\boost\token_functions.hpp %_DEST%\inc%_EXT%\boost\token_functions.hpp ..\%__SRC%\misc\build\boost-1.30.2\boost\enable_shared_from_this.hpp %_DEST%\inc%_EXT%\boost\enable_shared_from_this.hpp ..\%__SRC%\misc\build\boost-1.30.2\boost\cast.hpp %_DEST%\inc%_EXT%\boost\cast.hpp Index: util/xmlhelp/prj/build.lst diff -u util/xmlhelp/prj/build.lst:1.12 util/xmlhelp/prj/build.lst:1.12.2.1 --- util/xmlhelp/prj/build.lst:1.12 Thu May 10 06:15:49 2007 +++ util/xmlhelp/prj/build.lst Sat May 19 05:16:07 2007 @@ -1,4 +1,4 @@ -xh xmlhelp : ucbhelper XmlSearch LIBXSLT:libxslt jut unoil BERKELEYDB:berkeleydb svtools NULL +xh xmlhelp : ucbhelper LIBXSLT:libxslt jut unoil BERKELEYDB:berkeleydb svtools NULL xh xmlhelp usr1 - all xh_mkout NULL xh xmlhelp\inc nmake - all xh_inc NULL xh xmlhelp\source\helpprovider 
nmake - all xh_helpprovider xh_inc NULL Index: util/xmlhelp/prj/d.lst diff -u util/xmlhelp/prj/d.lst:1.14 util/xmlhelp/prj/d.lst:1.14.38.2 --- util/xmlhelp/prj/d.lst:1.14 Wed Jul 5 14:13:02 2006 +++ util/xmlhelp/prj/d.lst Wed May 23 12:41:18 2007 @@ -1,5 +1,5 @@ ..\%__SRC%\bin\*.dll %_DEST%\bin%_EXT%\*.* -..\%__SRC%\bin\com.sun.star.help.HelpLinker %_DEST%\bin%_EXT%\com.sun.star.help.HelpLinker +..\%__SRC%\bin\HelpLinker* %_DEST%\bin%_EXT% ..\%__SRC%\lib\lib*.so %_DEST%\lib%_EXT% ..\%__SRC%\lib\*.dylib %_DEST%\lib%_EXT%\*.* ..\%__SRC%\class\*.jar %_DEST%\bin%_EXT%\*.* Index: util/xmlhelp/source/com/sun/star/help/HelpCompiler.cxx diff -u /dev/null util/xmlhelp/source/com/sun/star/help/HelpCompiler.cxx:1.1.2.3 --- /dev/null Mon Jun 11 01:20:10 2007 +++ util/xmlhelp/source/com/sun/star/help/HelpCompiler.cxx Sat Jun 9 06:14:05 2007 @@ -0,0 +1,534 @@ +/************************************************************************* + * + * OpenOffice.org - a multi-platform office productivity suite + * + * $RCSfile$ + * + * $Revision$ + * + * last change: $Author$ $Date$ + * + * The Contents of this file are made available subject to + * the terms of GNU Lesser General Public License Version 2.1. + * + * + * GNU Lesser General Public License Version 2.1 + * ============================================= + * Copyright 2005 by Sun Microsystems, Inc. + * 901 San Antonio Road, Palo Alto, CA 94303, USA + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ * MA 02111-1307 USA
+ *
+ ************************************************************************/
+
+
+#include "HelpCompiler.hxx"
+#include
+#include
+#include
+#include
+#include
+#include
+
+// Stores references/copies of the compile parameters and tells libxml2 to
+// drop ignorable blank text nodes for everything parsed afterwards
+// (xmlKeepBlanksDefaultValue is a process-wide libxml2 setting).
+HelpCompiler::HelpCompiler(StreamTable &in_streamTable, const fs::path &in_inputFile,
+    const fs::path &in_src, const fs::path &in_resEmbStylesheet,
+    const std::string &in_module, const std::string &in_lang)
+    : streamTable(in_streamTable), inputFile(in_inputFile),
+    src(in_src), module(in_module), lang(in_lang), resEmbStylesheet(in_resEmbStylesheet)
+{
+    xmlKeepBlanksDefaultValue = 0;
+}
+
+// Parses filePath and runs it through the "embed" stylesheet.
+//
+// Returns the transformed document (caller owns it and must xmlFreeDoc it),
+// or whatever xsltApplyStylesheet returns on failure.
+//
+// The stylesheet and its two parameters ('Language' and 'fsroot') live in
+// function-local statics: they are built from the members of the FIRST
+// HelpCompiler instance that gets here and are reused, unchanged, by every
+// later call in the process.
+xmlDocPtr HelpCompiler::getSourceDocument(const fs::path &filePath)
+{
+    static const char *params[4 + 1];
+    static xsltStylesheetPtr cur = NULL;
+    if (!cur)
+    {
+        // XPath string literals, hence the surrounding single quotes.
+        // Static so that the c_str() pointers stored in params stay valid.
+        static std::string fsroot('\'' + src.toUTF8() + '\'');
+        static std::string esclang('\'' + lang + '\'');
+
+        xmlSubstituteEntitiesDefault(1);
+        xmlLoadExtDtdDefaultValue = 1;
+        cur = xsltParseStylesheetFile((const xmlChar *)resEmbStylesheet.native_file_string().c_str());
+
+        int nbparams = 0;
+        params[nbparams++] = "Language";
+        params[nbparams++] = esclang.c_str();
+        params[nbparams++] = "fsroot";
+        params[nbparams++] = fsroot.c_str();
+        params[nbparams] = NULL;
+    }
+    xmlDocPtr doc = xmlParseFile(filePath.native_file_string().c_str());
+    xmlDocPtr res = xsltApplyStylesheet(cur, doc, params);
+    xmlFreeDoc(doc);
+    return res;
+}
+
+// Appends rValue to rSet unless it is already present. HashSet is a plain
+// vector, so uniqueness has to be enforced by hand; without it compile()
+// would process the same application more than once and overwrite (leak)
+// the StreamTable entries produced by the earlier pass.
+static void pushUnique(HashSet &rSet, const std::string &rValue)
+{
+    if (std::find(rSet.begin(), rSet.end(), rValue) == rSet.end())
+        rSet.push_back(rValue);
+}
+
+// Collects the applications the document has to be compiled for: the
+// "select" value of every <caseinline> below a <switchinline select="appl">,
+// plus "DEFAULT" (always present in the result).
+HashSet HelpCompiler::switchFind(xmlDocPtr doc)
+{
+    HashSet hs;
+    xmlChar *xpath = (xmlChar*)"//switchinline";
+
+    xmlXPathContextPtr context = xmlXPathNewContext(doc);
+    xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
+    xmlXPathFreeContext(context);
+    if (result)
+    {
+        xmlNodeSetPtr nodeset = result->nodesetval;
+        // nodesetval may be NULL when the expression matched nothing
+        const int nNodes = nodeset ? nodeset->nodeNr : 0;
+        for (int i = 0; i < nNodes; i++)
+        {
+            xmlNodePtr el = nodeset->nodeTab[i];
+            xmlChar *select = xmlGetProp(el, (xmlChar*)"select");
+            if (select)
+            {
+                if (!strcmp((const char*)select, "appl"))
+                {
+                    xmlNodePtr n1 = el->xmlChildrenNode;
+                    while (n1)
+                    {
+                        if ((!xmlStrcmp(n1->name, (const xmlChar*)"caseinline")))
+                        {
+                            // a <caseinline> without "select" yields NULL;
+                            // constructing a std::string from NULL is undefined
+                            xmlChar *appl = xmlGetProp(n1, (xmlChar*)"select");
+                            if (appl)
+                            {
+                                pushUnique(hs, std::string((const char*)appl));
+                                xmlFree(appl);
+                            }
+                        }
+                        else if ((!xmlStrcmp(n1->name, (const xmlChar*)"defaultinline")))
+                            pushUnique(hs, std::string("DEFAULT"));
+                        n1 = n1->next;
+                    }
+                }
+                xmlFree(select);
+            }
+        }
+        xmlXPathFreeObject(result);
+    }
+    pushUnique(hs, std::string("DEFAULT"));
+    return hs;
+}
+
+// Returns a copy of the subtree below 'node' with every
+// <switch>/<switchinline select="appl"> resolved for application 'appl':
+// the switch element itself is dropped and replaced by the children of the
+// matching <caseinline> (or of <defaultinline> when appl is "DEFAULT").
+// A switch without a matching branch contributes nothing.
+// The caller owns the returned node.
+xmlNodePtr HelpCompiler::clone(xmlNodePtr node, const std::string& appl)
+{
+    // extended == 2: copy the node with its properties/namespaces but
+    // without children; the children are added selectively below.
+    xmlNodePtr parent = xmlCopyNode(node, 2);
+    xmlNodePtr n = node->xmlChildrenNode;
+    while (n != NULL)
+    {
+        bool isappl = false;
+        if ( (!strcmp((const char*)n->name, "switchinline")) ||
+             (!strcmp((const char*)n->name, "switch")) )
+        {
+            xmlChar *select = xmlGetProp(n, (xmlChar*)"select");
+            if (select)
+            {
+                if (!strcmp((const char*)select, "appl"))
+                    isappl = true;
+                xmlFree(select);
+            }
+        }
+        if (isappl)
+        {
+            xmlNodePtr caseNode = n->xmlChildrenNode;
+            if (appl == "DEFAULT")
+            {
+                while (caseNode)
+                {
+                    if (!strcmp((const char*)caseNode->name, "defaultinline"))
+                    {
+                        xmlNodePtr cnl = caseNode->xmlChildrenNode;
+                        while (cnl)
+                        {
+                            xmlAddChild(parent, clone(cnl, appl));
+                            cnl = cnl->next;
+                        }
+                        break;
+                    }
+                    caseNode = caseNode->next;
+                }
+            }
+            else
+            {
+                while (caseNode)
+                {
+                    isappl=false;
+                    if (!strcmp((const char*)caseNode->name, "caseinline"))
+                    {
+                        // The application name is the "select" attribute of
+                        // the <caseinline> itself. (Reading it from the
+                        // enclosing switch 'n' always yields "appl" and so
+                        // never matches a real application.)
+                        xmlChar *select = xmlGetProp(caseNode, (xmlChar*)"select");
+                        if (select)
+                        {
+                            if (!strcmp((const char*)select, appl.c_str()))
+                                isappl = true;
+                            xmlFree(select);
+                        }
+                        if (isappl)
+                        {
+                            xmlNodePtr cnl = caseNode->xmlChildrenNode;
+                            while (cnl)
+                            {
+                                xmlAddChild(parent, clone(cnl, appl));
+                                cnl = cnl->next;
+                            }
+                            break;
+                        }
+
+                    }
+                    caseNode = caseNode->next;
+                }
+            }
+
+        }
+        else
+            xmlAddChild(parent, clone(n, appl));
+
+        n = n->next;
+    }
+    return parent;
+}
+
+// Walks a resolved document and gathers document id, file name, title,
+// help ids, index keywords and extended help texts.
+//
+// hidlist, keywords and helptexts are allocated in the constructor and are
+// NOT freed by this class; HelpCompiler::compile() hands them over to the
+// StreamTable, whose destructor deletes them.
+class myparser
+{
+public:
+    std::string documentId;
+    std::string fileName;
+    std::string title;
+    HashSet *hidlist;
+    Hashtable *keywords;
+    Stringtable *helptexts;
+private:
+    // help ids seen since the last <ahelp>; they receive that element's text
+    HashSet extendedHelpText;
+public:
+    myparser(const std::string &indocumentId, const std::string &infileName,
+        const std::string &intitle) : documentId(indocumentId), fileName(infileName),
+        title(intitle)
+    {
+        hidlist = new HashSet;
+        keywords = new Hashtable;
+        helptexts = new Stringtable;
+    }
+    void traverse( xmlNodePtr parentNode );
+private:
+    std::string dump(xmlNodePtr node);
+};
+
+// Returns the concatenated content of all text nodes in the subtree rooted
+// at 'node', in document order.
+std::string myparser::dump(xmlNodePtr node)
+{
+    std::string app;
+    if (node->xmlChildrenNode)
+    {
+        xmlNodePtr list = node->xmlChildrenNode;
+        while (list)
+        {
+            app += dump(list);
+            list = list->next;
+        }
+    }
+    if (xmlNodeIsText(node))
+    {
+        xmlChar *pContent = xmlNodeGetContent(node);
+        app += std::string((const char*)pContent);
+        xmlFree(pContent);
+    }
+    return app;
+}
+
+// Strips leading and trailing blanks (' ' only) from str in place.
+void trim(std::string& str)
+{
+    std::string::size_type pos = str.find_last_not_of(' ');
+    if(pos != std::string::npos)
+    {
+        str.erase(pos + 1);
+        pos = str.find_first_not_of(' ');
+        if(pos != std::string::npos)
+            str.erase(0, pos);
+    }
+    else
+        str.erase(str.begin(), str.end());
+}
+
+// Recursively visits every descendant of parentNode and records:
+//   <filename>  -> fileName   (first one wins)
+//   <title>     -> title      (first one wins)
+//   <bookmark branch="hid/..."> -> hidlist / documentId / pending help ids
+//   <bookmark branch="index">   -> keywords[id] = list of bookmark values
+//   <ahelp>     -> helptexts for all pending help ids
+void myparser::traverse( xmlNodePtr parentNode )
+{
+    // traverse all nodes that belong to the parent
+    xmlNodePtr test ;
+    for (test = parentNode->xmlChildrenNode; test; test = test->next)
+    {
+        if (fileName.empty() && !strcmp((const char*)test->name, "filename"))
+        {
+            xmlNodePtr node = test->xmlChildrenNode;
+            if (xmlNodeIsText(node))
+            {
+                xmlChar *pContent = xmlNodeGetContent(node);
+                fileName = std::string((const char*)pContent);
+                xmlFree(pContent);
+            }
+        }
+        else if (title.empty() && !strcmp((const char*)test->name, "title"))
+        {
+            title = dump(test);
+            // NOTE(review): as written this assignment is a no-op; the
+            // placeholder text was presumably lost - confirm against the
+            // original source.
+            if (title.empty())
+                title = "";
+        }
+        else if (!strcmp((const char*)test->name, "bookmark"))
+        {
+            // Either attribute may be missing, in which case xmlGetProp
+            // returns NULL; treat that as an empty string.
+            xmlChar *branchxml = xmlGetProp(test, (const xmlChar*)"branch");
+            xmlChar *idxml = xmlGetProp(test, (const xmlChar*)"id");
+            std::string branch(branchxml ? (const char*)branchxml : "");
+            std::string anchor(idxml ? (const char*)idxml : "");
+            if (branchxml)
+                xmlFree (branchxml);
+            if (idxml)
+                xmlFree (idxml);
+
+            std::string hid;
+
+            if (branch.find("hid") == 0)
+            {
+                size_t index = branch.find('/');
+                if (index != std::string::npos)
+                {
+                    hid = branch.substr(1 + index);
+                    // one shall serve as a documentId
+                    if (documentId.empty())
+                        documentId = hid;
+                    extendedHelpText.push_back(hid);
+                    std::string foo = anchor.empty() ? hid : hid + "#" + anchor;
+                    HCDBG(std::cerr << "hid pushback" << foo << std::endl);
+                    hidlist->push_back(foo);
+                }
+                else
+                    continue;
+            }
+            else if (branch.compare("index") == 0)
+            {
+                LinkedList ll;
+
+                for (xmlNodePtr nd = test->xmlChildrenNode; nd; nd = nd->next)
+                {
+                    if (strcmp((const char*)nd->name, "bookmark_value"))
+                        continue;
+
+                    std::string embedded;
+                    xmlChar *embeddedxml = xmlGetProp(nd, (const xmlChar*)"embedded");
+                    if (embeddedxml)
+                    {
+                        embedded = std::string((const char*)embeddedxml);
+                        xmlFree (embeddedxml);
+                        std::transform (embedded.begin(), embedded.end(),
+                            embedded.begin(), tolower);
+                    }
+
+                    // embedded="true" values are not indexed
+                    bool isEmbedded = !embedded.empty() && embedded.compare("true") == 0;
+                    if (isEmbedded)
+                        continue;
+
+                    // normalise "a ; b" to "a;b" (blanks around the first ';')
+                    std::string keyword = dump(nd);
+                    size_t keywordSem = keyword.find(';');
+                    if (keywordSem != std::string::npos)
+                    {
+                        std::string tmppre =
+                            keyword.substr(0,keywordSem);
+                        trim(tmppre);
+                        std::string tmppos =
+                            keyword.substr(1+keywordSem);
+                        trim(tmppos);
+                        keyword = tmppre + ";" + tmppos;
+                    }
+                    ll.push_back(keyword);
+                }
+                if (!ll.empty())
+                    (*keywords)[anchor] = ll;
+            }
+            else if (branch.compare("contents") == 0)
+            {
+                // currently not used
+            }
+        }
+        else if (!strcmp((const char*)test->name, "ahelp"))
+        {
+            std::string text = dump(test);
+            trim(text);
+            std::string name;
+
+            // every help id collected since the previous <ahelp> gets this text
+            HashSet::const_iterator aEnd = extendedHelpText.end();
+            for (HashSet::const_iterator iter = extendedHelpText.begin(); iter != aEnd;
+                ++iter)
+            {
+                name = *iter;
+                (*helptexts)[name] = text;
+            }
+            extendedHelpText.clear();
+        }
+
+        // traverse children
+        traverse(test);
+    }
+}
+
+// Compiles inputFile into streamTable: once for "DEFAULT" and, if the
+// document has a case for it, once for the requested module.
+// Terminates the process (exit) if the input cannot be loaded; otherwise
+// returns true.
+bool HelpCompiler::compile()
+{
+    // we now have the jaroutputstream, which will contain the document.
+    // now determine the document as a dom tree in variable docResolved
+
+    xmlDocPtr docResolvedOrg = getSourceDocument(inputFile);
+
+    // now add path to the document
+    // resolve the dom
+    if (!docResolvedOrg)
+    {
+        // report the file that failed to load, not the source root
+        std::cerr << "ERROR: file not existing: " << inputFile.native_file_string().c_str() << std::endl;
+        exit(1);
+    }
+
+    // now find all applications for which one has to compile
+    std::string documentId;
+    std::string fileName;
+    std::string title;
+    // returns all applications for which one has to compile
+    HashSet applications = switchFind(docResolvedOrg);
+
+    HashSet::const_iterator aEnd = applications.end();
+    for (HashSet::const_iterator aI = applications.begin(); aI != aEnd; ++aI)
+    {
+        std::string appl = *aI;
+        std::string modulename = appl;
+        // a leading 'S' is dropped and the rest lower-cased before the
+        // comparison with 'module'; guard against an empty select value
+        if (!modulename.empty() && modulename[0] == 'S')
+        {
+            modulename = modulename.substr(1);
+            std::transform(modulename.begin(), modulename.end(), modulename.begin(), tolower);
+        }
+        if (modulename != "DEFAULT" && modulename != module)
+            continue;
+
+        // returns a clone of the document with switch-cases resolved
+        xmlNodePtr docResolved = clone(xmlDocGetRootElement(docResolvedOrg), appl);
+        myparser aparser(documentId, fileName, title);
+        aparser.traverse(docResolved);
+
+        // carry the values over so that later passes keep the first ones found
+        documentId = aparser.documentId;
+        fileName = aparser.fileName;
+        title = aparser.title;
+
+        HCDBG(std::cerr << documentId << " : " << fileName << " : " << title << std::endl);
+
+        xmlDocPtr docResolvedDoc = xmlCopyDoc(docResolvedOrg, false);
+        xmlDocSetRootElement(docResolvedDoc, docResolved);
+
+        // ownership of the document and of the parser's tables passes to
+        // streamTable here
+        if (modulename == "DEFAULT")
+        {
+            streamTable.default_doc = docResolvedDoc;
+            streamTable.default_hidlist = aparser.hidlist;
+            streamTable.default_helptexts = aparser.helptexts;
+            streamTable.default_keywords = aparser.keywords;
+        }
+        else if (modulename == module)
+        {
+            streamTable.appl_doc = docResolvedDoc;
+            streamTable.appl_hidlist = aparser.hidlist;
+            streamTable.appl_helptexts = aparser.helptexts;
+            streamTable.appl_keywords = aparser.keywords;
+        }
+        else
+        {
+            std::cerr << "unexpected case situation" << std::endl;
+            exit(-1);
+        }
+
+    } // end iteration over all applications
+
+    streamTable.document_id = documentId;
+    streamTable.document_path = fileName;
+    streamTable.document_title = title;
+    // the module is the path component following a leading "/text/" in the
+    // document's file name; fall back to the requested module otherwise
+    std::string actMod = module;
+    if (!fileName.empty())
+    {
+        if (fileName.find("/text/") == 0)
+        {
+            int len = strlen("/text/");
+            actMod = fileName.substr(len);
+            actMod = actMod.substr(0, actMod.find('/'));
+        }
+    }
+    streamTable.document_module = actMod;
+
+    xmlFreeDoc(docResolvedOrg);
+    return true;
+}
+
+// Thin file-system helpers on top of osl. Return codes of the osl calls are
+// not checked here.
+namespace fs
+{
+    // Creates the directory via osl::Directory::createPath.
+    void create_directory(const fs::path indexDirName)
+    {
+        HCDBG(
+            std::cerr << "creating " <<
+            rtl::OUStringToOString(indexDirName.data, RTL_TEXTENCODING_UTF8).getStr()
+            << std::endl
+           );
+        osl::Directory::createPath(indexDirName.data);
+    }
+
+    // Moves src to dest.
+    void rename(const fs::path &src, const fs::path &dest)
+    {
+        osl::File::move(src.data, dest.data);
+    }
+
+    // True if 'in' can be opened for reading.
+    bool exists(const fs::path &in)
+    {
+        osl::File tmp(in.data);
+        return (tmp.open(osl_File_OpenFlag_Read) == osl::FileBase::E_None);
+    }
+
+    // Removes a single file.
+    void remove(const fs::path &in)
+    {
+        osl::File::remove(in.data);
+    }
+
+    // Removes the directory at the given URL together with everything below it.
+    void removeRecursive(rtl::OUString const& _suDirURL)
+    {
+        // inner scope: aDir is closed and destroyed before the directory
+        // itself is removed
+        {
+            osl::Directory aDir(_suDirURL);
+            aDir.open();
+            if (aDir.isOpen())
+            {
+                osl::DirectoryItem aItem;
+                osl::FileStatus aStatus(osl_FileStatus_Mask_FileName | osl_FileStatus_Mask_Attributes);
+                while (aDir.getNextItem(aItem) == ::osl::FileBase::E_None)
+                {
+                    if (osl::FileBase::E_None == aItem.getFileStatus(aStatus) &&
+                        aStatus.isValid(osl_FileStatus_Mask_FileName | osl_FileStatus_Mask_Attributes))
+                    {
+                        rtl::OUString suFilename = aStatus.getFileName();
+                        rtl::OUString suFullFileURL;
+                        suFullFileURL += _suDirURL;
+                        suFullFileURL += rtl::OUString::createFromAscii("/");
+                        suFullFileURL += suFilename;
+
+                        if (aStatus.getFileType() == osl::FileStatus::Directory)
+                            removeRecursive(suFullFileURL);
+                        else
+                            osl::File::remove(suFullFileURL);
+                    }
+                }
+                aDir.close();
+            }
+        }
+        osl::Directory::remove(_suDirURL);
+    }
+
+    // Removes 'in' and everything below it.
+    void remove_all(const fs::path &in)
+    {
+        removeRecursive(in.data);
+    }
+}
+
+/* vi:set tabstop=4 shiftwidth=4 expandtab: */
Index: util/xmlhelp/source/com/sun/star/help/HelpCompiler.hxx
diff -u /dev/null util/xmlhelp/source/com/sun/star/help/HelpCompiler.hxx:1.1.2.5
--- /dev/null Mon Jun 11 01:20:10 2007
+++ util/xmlhelp/source/com/sun/star/help/HelpCompiler.hxx Sat Jun 9 06:14:05 2007
@@ -0,0 +1,286 @@
+/*************************************************************************
+ *
+ * OpenOffice.org - a multi-platform office productivity suite
+ *
+ * $RCSfile$
+ *
+ * $Revision$
+ *
+ * last change: $Author$ $Date$
+ *
+ * The Contents of this file are made available subject to
+ * the terms of GNU Lesser General Public License Version 2.1.
+ *
+ *
+ * GNU Lesser General Public License Version 2.1
+ * =============================================
+ * Copyright 2005 by Sun Microsystems, Inc.
+ * 901 San Antonio Road, Palo Alto, CA 94303, USA
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ * MA 02111-1307 USA
+ *
+ ************************************************************************/
+
+#ifndef HELPCOMPILER_HXX
+#define HELPCOMPILER_HXX
+
+// NOTE(review): the header names of the #include lines below are missing in
+// this copy (presumably lost in extraction) - restore from the original.
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef SYSTEM_DB
+#include
+#else
+#include
+#endif
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define EMULATEORIGINAL 1
+// HCDBG(stmt) executes stmt only when CMCDEBUG is defined; otherwise stmt is
+// still compiled but never run.
+// NOTE(review): CMCDEBUG is defined unconditionally here, so all HCDBG
+// diagnostics are always active - confirm this is intended.
+#define CMCDEBUG
+#ifdef CMCDEBUG
+    #define HCDBG(foo) do { if (1) foo; } while(0)
+#else
+    #define HCDBG(foo) do { if (0) foo; } while(0)
+#endif
+
+// Small path abstraction plus a handful of file operations built on osl.
+namespace fs
+{
+    // Tag type selecting the path(std::string, convert) constructor.
+    enum convert { native };
+    // A path stored as a file URL in 'data'.
+    class path
+    {
+    public:
+        ::rtl::OUString data;
+    public:
+        path() {}
+        path(const path &rOther) : data(rOther.data) {}
+        // Builds the path from a system path given in the thread's text
+        // encoding; the result is a file URL made absolute against the
+        // process working directory.
+        path(const std::string &in, convert)
+        {
+            rtl::OUString sWorkingDir;
+            osl_getProcessWorkingDir(&sWorkingDir.pData);
+
+            rtl::OString tmp(in.c_str());
+            rtl::OUString ustrSystemPath(rtl::OStringToOUString(tmp, osl_getThreadTextEncoding()));
+            osl::File::getFileURLFromSystemPath(ustrSystemPath, data);
+            osl::File::getAbsoluteFileURL(sWorkingDir, data, data);
+        }
+        // Converts the URL back to a system path in the thread's text
+        // encoding. Also emits the result through HCDBG.
+        std::string native_file_string() const
+        {
+            ::rtl::OUString ustrSystemPath;
+            osl::File::getSystemPathFromFileURL(data, ustrSystemPath);
+            rtl::OString tmp(rtl::OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding()));
+            HCDBG(std::cerr << "native_file_string is " << tmp.getStr() << std::endl);
+            return std::string(tmp.getStr());
+        }
+        std::string native_directory_string() const { return native_file_string(); }
+        // Returns 'data' itself (the URL, not the system path) as UTF-8.
+        std::string toUTF8() const
+        {
+            rtl::OString tmp(rtl::OUStringToOString(data, RTL_TEXTENCODING_UTF8));
+            return std::string(tmp.getStr());
+        }
+        bool empty() const { return data.getLength() == 0; }
+        // Returns a copy of this path with '/' and 'in' appended. 'in' is
+        // decoded with the thread's text encoding and appended as is.
+        path operator/(const std::string &in) const
+        {
+            path ret(*this);
+            HCDBG(std::cerr << "orig was " <<
+                rtl::OUStringToOString(ret.data, RTL_TEXTENCODING_UTF8).getStr() << std::endl);
+            rtl::OString tmp(in.c_str());
+            rtl::OUString ustrSystemPath(rtl::OStringToOUString(tmp, osl_getThreadTextEncoding()));
+            ret.data += rtl::OUString(sal_Unicode('/'));
+            ret.data += ustrSystemPath;
+            HCDBG(std::cerr << "final is " <<
+                rtl::OUStringToOString(ret.data, RTL_TEXTENCODING_UTF8).getStr() << std::endl);
+            return ret;
+        }
+        // Appends 'in' to the URL in place, without inserting a separator.
+        void append(const char *in)
+        {
+            rtl::OString tmp(in);
+            rtl::OUString ustrSystemPath(rtl::OStringToOUString(tmp, osl_getThreadTextEncoding()));
+            data = data + ustrSystemPath;
+        }
+        void append(const std::string &in) { append(in.c_str()); }
+    };
+
+    void create_directory(const fs::path indexDirName);
+    void rename(const fs::path &src, const fs::path &dest);
+    bool exists(const fs::path &in);
+    void remove_all(const fs::path &in);
+    void remove(const fs::path &in);
+}
+
+// String hash functor: mixes each byte into the hash with shift/xor steps
+// and applies a final mix. Presumably Bob Jenkins' one-at-a-time hash, going
+// by the name - confirm. Currently selected through pref_hash below.
+struct joaat_hash
+{
+    size_t operator()(const std::string &str) const
+    {
+        size_t hash = 0;
+        const char *key = str.data();
+        for (size_t i = 0; i < str.size(); i++)
+        {
+            hash += key[i];
+            hash += (hash << 10);
+            hash ^= (hash >> 6);
+        }
+        hash += (hash << 3);
+        hash ^= (hash >> 11);
+        hash += (hash << 15);
+        return hash;
+    }
+};
+
+// Reads two bytes at d as a 16-bit value, byte d[0] being the low byte.
+#define get16bits(d) ((((sal_uInt32)(((const sal_uInt8 *)(d))[1])) << 8)\
+    +(sal_uInt32)(((const sal_uInt8 *)(d))[0]) )
+
+// Alternative string hash functor that consumes the input four bytes per
+// iteration and handles the 1-3 remaining bytes separately. Returns 0 for an
+// empty string. Not selected by pref_hash in this header.
+struct SuperFastHash
+{
+    size_t operator()(const std::string &str) const
+    {
+        const char * data = str.data();
+        int len = str.size();
+        size_t hash = len, tmp;
+        if (len <= 0 || data == NULL) return 0;
+
+        // rem: bytes left over after the 4-byte main loop
+        int rem = len & 3;
+        len >>= 2;
+
+        /* Main loop */
+        for (;len > 0; len--)
+        {
+            hash += get16bits (data);
+            tmp = (get16bits (data+2) << 11) ^ hash;
+            hash = (hash << 16) ^ tmp;
+            data += 2*sizeof (sal_uInt16);
+            hash += hash >> 11;
+        }
+
+        /* Handle end cases */
+        switch (rem)
+        {
+            case 3: hash += get16bits (data);
+                    hash ^= hash << 16;
+                    hash ^= data[sizeof (sal_uInt16)] << 18;
+                    hash += hash >> 11;
+                    break;
+            case 2: hash += get16bits (data);
+                    hash ^= hash << 11;
+                    hash += hash >> 17;
+                    break;
+            case 1: hash += *data;
+                    hash ^= hash << 10;
+                    hash += hash >> 1;
+        }
+
+        /* Force "avalanching" of final 127 bits */
+        hash ^= hash << 3;
+        hash += hash >> 5;
+        hash ^= hash << 4;
+        hash += hash >> 17;
+        hash ^= hash << 25;
+        hash += hash >> 6;
+
+        return hash;
+    }
+};
+
+// Name for the hash functor selected for this file's hash tables.
+#define pref_hash joaat_hash
+
+// NOTE(review): the template arguments of the four typedefs below are missing
+// in this copy (presumably lost in extraction) - restore from the original.
+typedef std::hash_map Stringtable;
+typedef std::list LinkedList;
+typedef std::vector HashSet;
+
+typedef std::hash_map Hashtable;
+
+// Result of compiling one help document: document metadata plus one set of
+// tables/document for the requested application ("appl_*") and one for the
+// default case ("default_*"). All pointers start out NULL; the destructor
+// deletes the tables and frees the documents, i.e. the table owns them.
+// NOTE(review): no copy constructor/assignment is declared, so copying a
+// StreamTable would free the same pointers twice.
+class StreamTable
+{
+public:
+    std::string document_id;
+    std::string document_path;
+    std::string document_module;
+    std::string document_title;
+
+    HashSet *appl_hidlist;
+    Hashtable *appl_keywords;
+    Stringtable *appl_helptexts;
+    xmlDocPtr appl_doc;
+
+    HashSet *default_hidlist;
+    Hashtable *default_keywords;
+    Stringtable *default_helptexts;
+    xmlDocPtr default_doc;
+
+    StreamTable() :
+        appl_hidlist(NULL), appl_keywords(NULL), appl_helptexts(NULL), appl_doc(NULL),
+        default_hidlist(NULL), default_keywords(NULL), default_helptexts(NULL), default_doc(NULL)
+    {}
+    ~StreamTable()
+    {
+        delete appl_hidlist;
+        delete appl_keywords;
+        delete appl_helptexts;
+        if (appl_doc) xmlFreeDoc(appl_doc);
+        delete default_hidlist;
+        delete default_keywords;
+        delete default_helptexts;
+        if (default_doc) xmlFreeDoc(default_doc);
+    }
+};
+
+class HelpCompiler
+{
+public:
+    HelpCompiler(StreamTable &streamTable,
+        const fs::path &in_inputFile,
+        const fs::path &in_src,
+        const fs::path &in_resEmbStylesheet,
+        const std::string &in_module,
+        const std::string &in_lang);
+    bool compile(void);
+    void addEntryToJarFile(const std::string &prefix,
+        const std::string &entryName, const std::string &bytesToAdd);
+    void addEntryToJarFile(const std::string &prefix,
+        const std::string &entryName, const HashSet &bytesToAdd);
+    void addEntryToJarFile(const std::string &prefix,
+        const std::string &entryName, const
Stringtable &bytesToAdd); + void addEntryToJarFile(const std::string &prefix, + const std::string &entryName, const Hashtable &bytesToAdd); +private: + xmlDocPtr getSourceDocument(const fs::path &filePath); + HashSet switchFind(xmlDocPtr doc); + xmlNodePtr clone(xmlNodePtr node, const std::string& appl); + StreamTable &streamTable; + const fs::path inputFile, src; + const std::string module, lang; + const fs::path resEmbStylesheet; +}; + +#endif + +/* vi:set tabstop=4 shiftwidth=4 expandtab: */ Index: util/xmlhelp/source/com/sun/star/help/HelpLinker.cxx diff -u /dev/null util/xmlhelp/source/com/sun/star/help/HelpLinker.cxx:1.1.2.6 --- /dev/null Mon Jun 11 01:20:10 2007 +++ util/xmlhelp/source/com/sun/star/help/HelpLinker.cxx Sat Jun 9 06:14:05 2007 @@ -0,0 +1,5437 @@ +/************************************************************************* + * + * OpenOffice.org - a multi-platform office productivity suite + * + * $RCSfile$ + * + * $Revision$ + * + * last change: $Author$ $Date$ + * + * The Contents of this file are made available subject to + * the terms of GNU Lesser General Public License Version 2.1. + * + * + * GNU Lesser General Public License Version 2.1 + * ============================================= + * Copyright 2005 by Sun Microsystems, Inc. + * 901 San Antonio Road, Palo Alto, CA 94303, USA + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ************************************************************************/ + +#include "HelpCompiler.hxx" + +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +class JarOutputStream +{ +private: + fs::path filename; + std::ostringstream perlline; +public: + JarOutputStream(); + void setname(const fs::path &name) { filename = name; } + const fs::path& getname() const { return filename; } + void addFile(const std::string &name, const std::string &key); + void addTree(const std::string &dir, const std::string &key); + void dontCompress(const std::string &key); + void commit(); +}; + +struct Data +{ + std::vector _idList; + typedef std::vector::const_iterator cIter; + + void append(const std::string &id) + { + _idList.push_back(id); + } + + std::string getString() const + { + std::string ret; + cIter aEnd = _idList.end(); + for (cIter aIter = _idList.begin(); aIter != aEnd; ++aIter) + ret += *aIter + ";"; + return ret; + } +}; + +class HelpKeyword +{ +private: + typedef std::hash_map DataHashtable; + DataHashtable _hash; + +public: + void insert(const std::string &key, const std::string &id) + { + Data &data = _hash[key]; + data.append(id); + } + + void dump(DB* table) + { + DataHashtable::const_iterator aEnd = _hash.end(); + for (DataHashtable::const_iterator aIter = _hash.begin(); aIter != aEnd; ++aIter) + { + const std::string &keystr = aIter->first; + DBT key; + memset(&key, 0, sizeof(key)); + key.data = const_cast(keystr.c_str()); + key.size = keystr.length(); + + const Data &data = aIter->second; + std::string str = data.getString(); + DBT value; + memset(&value, 0, sizeof(value)); + value.data = const_cast(str.c_str()); + value.size = str.length(); 
+ + table->put(table, NULL, &key, &value, 0); + } + } +}; + +namespace PrefixTranslator +{ + std::string translatePrefix(const std::string &input) + { + if (input.find("vnd.sun.star.help://") == 0) + return std::string("#HLP#") + input.substr(strlen("vnd.sun.star.help://")); + else + return input; + } +} + +class IndexAccessor +{ + fs::path _dirName; +public: + IndexAccessor(const fs::path &dirName) : _dirName(dirName) {} + IndexAccessor(const IndexAccessor &another) { _dirName = another._dirName; } + fs::path indexFile(const std::string &name) const { return _dirName / name; } + std::ifstream* getLineInput(const std::string &name); + std::fstream* getOutputStream(const std::string &name); + std::vector readByteArray(const std::string &fileName); + void clear(); + std::fstream *getRAF(const std::string &name, bool update); + void createIfNeeded() {} +}; + +std::ifstream* IndexAccessor::getLineInput(const std::string &name) +{ + return new std::ifstream(indexFile(name).native_file_string().c_str()); +} + +std::fstream* IndexAccessor::getOutputStream(const std::string &name) +{ + return new std::fstream(indexFile(name).native_file_string().c_str(), std::ios::out | std::ios::trunc | std::ios::binary); +} + +std::vector IndexAccessor::readByteArray(const std::string &fileName) +{ + std::ifstream in(indexFile(fileName).native_file_string().c_str(), std::ios::binary); + std::vector ret(1024*16); + int i=0; + while (in.good()) + { + int len = in.readsome((char *)&ret[i], 1024*16); + if (!len) + break; + i += len; + ret.resize(i+1024*16); + } + ret.resize(i); + return ret; +} + +std::fstream* IndexAccessor::getRAF(const std::string &name, bool update) +{ + std::fstream *_file = new std::fstream; + fs::path fullname = indexFile(name); + if (!update) + { + _file->open(fullname.native_file_string().c_str(), std::ios::in | std::ios::binary); + } + else + { + _file->open(fullname.native_file_string().c_str(), std::ios::in | std::ios::out | std::ios::binary); + if 
(!_file->is_open()) + { + HCDBG(std::cerr << "didn't exist" << std::endl); + _file->open(fullname.native_file_string().c_str(), std::ios::in | std::ios::out | std::ios::trunc | std::ios::binary); + } + if (!_file->is_open()) + std::cerr << "Cannot open " << name << std::endl; + } + return _file; +} + +void IndexAccessor::clear() +{ +#if 0 + File thisDir = indexFile("."); + File[] components = thisDir.listFiles(); + if (components != null) + for (int i = 0; i < components.length; i++) + components[i].delete(); +#endif +} + +typedef std::vector< std::string > VectorLines; + +class Schema : public IndexAccessor +{ +private: + static std::string PartName; + bool _update; + VectorLines _lines; +public: + Schema(const IndexAccessor &index, bool update); + std::ifstream* getSchemaLineInput() { return getLineInput(PartName); } + void read(); + Stringtable parameters(const std::string &name) const; + void update(const std::string &partName, const std::string ¶meters); + void save(); +}; + +std::string Schema::PartName = "SCHEMA"; + + +class startsWith +{ +public: + startsWith(const std::string &in) : str(in) {} + bool operator() ( const std::string &in ) const { return (in.find(str) == 0); } +private: + const std::string &str; +}; + +void Schema::update(const std::string &partName, const std::string &inparameters) +{ + VectorLines::iterator aEnd = std::remove_if(_lines.begin(), _lines.end(), startsWith(partName)); + if (aEnd != _lines.end()) _lines.erase(aEnd, _lines.end()); + _lines.push_back(partName + " " + inparameters); +} + +Stringtable Schema::parameters(const std::string &name) const +{ + Stringtable result; + VectorLines::const_iterator aEnd = _lines.end(); + for (VectorLines::const_iterator aIter = _lines.begin(); aIter != aEnd; ++aIter) + { + if (aIter->find(name) == 0) + { + boost::char_separator sep(" ="); + boost::tokenizer< boost::char_separator > tokens(name, sep); + boost::tokenizer< boost::char_separator >::const_iterator it = tokens.begin(); + ++it; // 
skip name + while(it != tokens.end()) + { + const std::string &part1 = *it; + ++it; + if (it == tokens.end()) + break; + const std::string &part2 = *it; + result[part1] = part2; + ++it; + } + break; + } + } + return result; +} + +Schema::Schema(const IndexAccessor &index, bool inupdate) : IndexAccessor(index), + _update(inupdate) +{ + read(); +} + +#ifdef UNX +#define MAX_LINE PATH_MAX +#else +#define MAX_LINE _MAX_PATH +#endif + +void Schema::read() +{ + std::ifstream* in = getSchemaLineInput(); + char line[MAX_LINE]; + // This needs to be replaced with our XML Parser + while (in->getline(line, MAX_LINE)) + _lines.push_back(line); + delete in; +} + +void Schema::save() +{ + if (_update) + { + std::fstream* out = getOutputStream(PartName); + *out << "JavaSearch 1.0\n"; + VectorLines::const_iterator aEnd = _lines.end(); + for (VectorLines::const_iterator aIter = _lines.begin(); aIter != aEnd; ++aIter) + *out << *aIter << '\n'; + delete out; + } +} + +class DBPartParameters +{ + Schema &_schema; + std::string _partName; + Stringtable _parameters; +protected: + bool parametersKnown() const; + void updateSchema(const std::string ¶meters) { _schema.update(_partName, parameters); } +public: + DBPartParameters(Schema &schema, const std::string &partName); + int integerParameter(const std::string &name); +}; + +DBPartParameters::DBPartParameters(Schema &schema, const std::string &partName) + : _schema(schema), _partName(partName) +{ + _parameters = schema.parameters(partName); +} + +bool DBPartParameters::parametersKnown() const +{ + return !_parameters.empty(); +} + +int DBPartParameters::integerParameter(const std::string &name) +{ + std::istringstream converter(_parameters[name]); + int ret; + converter >> ret; + return ret; +} + +class BlockManagerParameters : public DBPartParameters +{ +private: + fs::path _file; + int _blockSize; +protected: + int _root; +public: + BlockManagerParameters(Schema &schema, const std::string &partName); + bool readState(); + const 
fs::path& getFile() const { return _file; } + int getBlockSize() const { return _blockSize; } + void setBlockSize(int size) { _blockSize = size; } + int getRootPosition() const { return _root; } + void setRoot(int root) { _root = root; } + void updateSchema(const std::string ¶ms); +}; + +void BlockManagerParameters::updateSchema(const std::string ¶ms) +{ + std::ostringstream tmp; + tmp << "bs=" << _blockSize << " rt=" << _root << " fl=-1 " << params; + DBPartParameters::updateSchema(tmp.str()); +} + +BlockManagerParameters::BlockManagerParameters(Schema &schema, const std::string &partName) + : DBPartParameters(schema, partName), _root(0) +{ + _file = schema.indexFile(partName); + HCDBG(std::cerr << "file name set to " << _file.native_file_string()); + readState(); +} + +bool BlockManagerParameters::readState() +{ + if (parametersKnown()) + { + _blockSize = integerParameter("bs"); + _root = integerParameter("rt"); + return true; + } + else + return false; +} + +class BtreeDictParameters : public BlockManagerParameters +{ +private: + int _id1; +public: + BtreeDictParameters(Schema &schema, const std::string &partName); + int getFreeID() const { return _id1; } + void setFreeID(int id) { _id1 = id; } + void updateSchema(); +}; + +void BtreeDictParameters::updateSchema() +{ + std::ostringstream tmp; + tmp << "id1=" << _id1 << " id2=1"; + BlockManagerParameters::updateSchema(tmp.str()); +} + +BtreeDictParameters::BtreeDictParameters(Schema &schema, const std::string &partName) + : BlockManagerParameters(schema, partName) +{ +} + +int readInt(std::fstream &in) +{ + HCDBG(std::cerr << "want to read at " << in.tellg() << std::endl); + int ret = 0; + for (char i = 3; i >= 0; --i) + { + unsigned char byte; + in >> byte; + ret |= (byte << (i*8)); + HCDBG(fprintf(stderr, "inputting %x ret is now %x\n", byte, ret)); + } + return ret; +} + +void writeByte(std::fstream &out, unsigned char byte) +{ + out << byte; +} + +void writeShort(std::fstream &out, int item) +{ + for (int i = 
1; i >= 0; --i) + { + unsigned char byte = static_cast((item >> (i*8))); + out << byte; + } +} + +void writeInt(std::fstream &out, int item) +{ + for (int i = 3; i >= 0; --i) + { + unsigned char byte = static_cast((item >> (i*8))); + HCDBG(fprintf(stderr, "outputting %x\n", byte)); + out << byte; + } +} + +void readFully(std::fstream &in, std::vector &_data) +{ + in.read((char*)(&_data[0]), _data.size()); +} + +/** + + Base class for (typically btree) blocks to hold either + byte vectors representing graph/tree edges, + or pairs (key, id) for dictionaries + + Each block has a header and a data section + + */ + +class Block +{ +public: + static int HEADERLEN; + // length of Block ID in bytes + static int IDLEN; + + // number of the block + // used for both referring to the block + // and addresssing the block in file + unsigned int _number; + bool _isLeaf; + // first available byte in data section + int _free; + std::vector _data; + + Block(int blocksize) : _number(0), _isLeaf(true), _free(0) + { + _data.resize(blocksize - HEADERLEN); + } + + virtual ~Block() {} + + void setBlockNumber(int n) { _number = n; } + virtual void setFree(int free) { _free = free; } + // interpret 4 bytes at 'i' as an integer + int integerAt(int i) const + { + int result = ((((((_data[i]&0xFF)<<8) + |_data[i+1]&0xFF)<<8) + |_data[i+2]&0xFF)<<8) + |_data[i+3]&0xFF; + return result; + } + void setIntegerAt(int i, int value) + { + /* + for (int j = i + 3; j >= i; j--, value >>= 8) + _data[j] = (unsigned char)(value & 0xFF); + */ + _data[i++] = (unsigned char)((value >> 24) & 0xFF); + _data[i++] = (unsigned char)((value >> 16) & 0xFF); + _data[i++] = (unsigned char)((value >> 8) & 0xFF); + _data[i] = (unsigned char)(value & 0xFF); + } + void readIn(std::fstream &in) + { + _number = readInt(in); + int twoFields = readInt(in); + _isLeaf = (twoFields & 0x80000000) != 0; + HCDBG(std::cerr << "read leaf as " << _isLeaf << std::endl); + _free = twoFields & 0x7FFFFFFF; + readFully(in, _data); + } + 
void writeOut(std::fstream &out) const + { + writeInt(out, _number); + writeInt(out, _free | (_isLeaf ? 0x80000000 : 0)); + out.write((char*)(&_data[0]), _data.size()); + } +}; + +int Block::HEADERLEN = 8; +// length of Block ID in bytes +int Block::IDLEN = 4; + +class BtreeDict; +class EntryProcessor; +typedef std::vector IntegerArray; + +class DictBlock : public Block +{ +public: + DictBlock(); + int free() const { return _free + firstEntry(); } + int numberOfEntries() const { return integerAt(0); } + int nthPointer(int n) const { return integerAt(4*(n + 1)); } + int getChildIdx(int index) const; + int entryKeyLength(int i) const { return _data[i] & 0xFF; } + int entryCompression(int i) const { return _data[i + 1] & 0xFF; } + int entryID(int i) const { return integerAt(i + 2); } + int entryLength(int entry) const; + int entryKey(int entry) const; + int firstEntry() const { return 4; } + int nextEntry(int entry) const { return entry + entryLength(entry); } + void restoreKeyInBuffer(int entry, std::vector &buffer); + std::string restoreKey(int entry, std::vector &buffer); + std::string findID(int id); + void setBlockNumbers(std::vector &blocks) const; + void listBlock(); + void doMap(BtreeDict &owner, const EntryProcessor &processor); + void withPrefix(BtreeDict &owner, const std::string &prefix, + size_t prefLen, IntegerArray &result); +}; + +class BlockFactory; + +class BlockProcessor; + +class BlockDescriptor +{ +public: + Block *_block; + bool _modf; + BlockDescriptor(Block *block) : _block(block), _modf(false) {} +}; // end of BlockDescriptor + +class BlockManager +{ +private: + static int INCR; + std::fstream _file; + long _blockSize; + bool _update; + BlockFactory *_blockFactory; + std::vector _blockTab; +public: + BlockManager(const BlockManagerParameters *params, + bool update, BlockFactory *bfactory); + ~BlockManager(); + Block& accessBlock(int blockNumber); + void setModified(int blNum); + void close(); + Block& getNewBlock(); + void 
processBlocks(BlockProcessor &processor); + void mapBlock(Block* block); + void addDescriptor(Block* block); +private: + void writeBlock(const Block &bl); +}; + +int BlockManager::INCR = 64; // size increment + +class EntryProcessor +{ +public: + virtual void processEntry(const std::string &string, int id) const = 0; + virtual ~EntryProcessor() {}; +}; + +class BtreeDict +{ +public: + static int ENTHEADERLEN; + static int BLOCKSIZE; + static int DATALEN; + static int MaxKeyLength; + static int lastPtrIndex; +protected: + BlockManager *blockManager; + int root; + std::vector blocks; + + BtreeDict() {/*empty*/} + ~BtreeDict() { delete blockManager; } + BtreeDict(const BtreeDictParameters *params); + void init(const BtreeDictParameters *params, bool update, + BlockFactory *bfactory); +public: + int fetch(const std::string &key); + void close(); +private: + std::string fetch(int conceptID); + IntegerArray withPrefix(const std::string &prefix); +public: + DictBlock& accessBlock(int index); + DictBlock& child(const DictBlock &bl, int index); +private: + std::string findID(int blNum, int id); + int find(const DictBlock &bl, std::vector &key, int index); + int find(const DictBlock &bl, std::vector &key); + void setBlocks(std::vector &blocks); + void map(const EntryProcessor &processor); +public: + void dumpnode(DictBlock &bl, int level); +}; + +class BlockFactory +{ +public: + virtual Block* makeBlock() const = 0; + virtual ~BlockFactory() {} +}; + +static int dictcount; + +class DictBlockFactory : public BlockFactory +{ +public: + Block* makeBlock() const + { + dictcount++; + return new DictBlock; + } +}; + +BtreeDict::BtreeDict(const BtreeDictParameters *params) +{ + init(params, false, new DictBlockFactory()); + blocks.resize(params->getFreeID()); + setBlocks(blocks); +} + +void BtreeDict::dumpnode(DictBlock &bl, int level) +{ + if (!bl._isLeaf) + { + fprintf(stderr, "\n"); + for (int i = 0; i < level; ++i) + fprintf(stderr, "\t"); + fprintf(stderr, "there are %d 
entries\n", bl.numberOfEntries()); + for (int i = 0; i < level; ++i) + fprintf(stderr, "\t"); + for (int i = 0; i < bl.numberOfEntries(); ++i) + { + int index = bl.getChildIdx(i); + fprintf(stderr, " %d ", index); + DictBlock &thischild = accessBlock(index); + dumpnode(thischild, level + 1); + } + fprintf(stderr, "\n"); + } +} + +int BtreeDict::fetch(const std::string &key) +{ + HCDBG(std::cerr << "fetching " << key << " from root " << root << std::endl); + DictBlock &bl = accessBlock(root); + + int length = key.size(); + std::vector Key(length + 1); + memcpy(&(Key[0]), key.c_str(), length); + Key[length] = 0; // sentinel + + return find(bl, Key); +} + +std::string BtreeDict::fetch(int conceptID) +{ + return findID(blocks[conceptID], conceptID); +} + +IntegerArray BtreeDict::withPrefix(const std::string &prefix) +{ + IntegerArray result; + accessBlock(root).withPrefix(*this, prefix, prefix.size(), result); + return result; +} + +void BtreeDict::close() +{ + blockManager->close(); +} + +void BtreeDict::init(const BtreeDictParameters *params, bool update, + BlockFactory *bfactory) +{ + blockManager = new BlockManager(params, update, bfactory); + root = params->getRootPosition(); +} + +DictBlock& BtreeDict::accessBlock(int index) +{ + return (DictBlock&)blockManager->accessBlock(index); +} + +DictBlock& BtreeDict::child(const DictBlock &bl, int index) +{ + if (bl._isLeaf) + { + std::cerr << "leaf's can't have children, screwed!" << std::endl; + exit(-1); + } + return accessBlock(bl.getChildIdx(index)); +} + +std::string BtreeDict::findID(int blNum, int id) +{ + return accessBlock(blNum).findID(id); +} + +int BtreeDict::find(const DictBlock &bl, std::vector &key, int index) +{ + HCDBG(std::cerr << "find2: " << bl._isLeaf << " : " << index << " : " << std::endl); + + return bl._isLeaf ? 
0 : find(child(bl, index), key); +} + +int BtreeDict::find(const DictBlock &bl, std::vector &key) +{ + int inputKeyLen = key.size() - 1; + int entryPtr = bl.firstEntry(); + int freeSpace = bl.free(); + int nCharsEqual = 0; + int compression = 0; + + HCDBG(std::cerr << "find1: " << inputKeyLen << " : " + << entryPtr << " : " << freeSpace << " : " << nCharsEqual << " " + << compression << std::endl); + + for (int entryIdx = 0;;) + { + if (entryPtr == freeSpace) + return find(bl, key, bl.numberOfEntries()); + else if (compression == nCharsEqual) + { + int keyLen = bl.entryKeyLength(entryPtr); + int keyPtr = bl.entryKey(entryPtr), i; + for (i = 0; i < keyLen && key[nCharsEqual] == bl._data[keyPtr + i]; i++) + ++nCharsEqual; + if (i == keyLen) + { + if (nCharsEqual == inputKeyLen) + return bl.entryID(entryPtr); + } + else if ((key[nCharsEqual]&0xFF) < (bl._data[keyPtr + i]&0xFF)) + return find(bl, key, entryIdx); + } + else if (compression < nCharsEqual) // compression dropped + return find(bl, key, entryPtr == freeSpace + ? 
bl.numberOfEntries() : entryIdx); + do + { + entryPtr = bl.nextEntry(entryPtr); + ++entryIdx; + } + while (bl.entryCompression(entryPtr) > nCharsEqual); + compression = bl.entryCompression(entryPtr); + } +} + +class BlockProcessor +{ +protected: + std::vector &blocks; +public: + BlockProcessor(std::vector &_blocks) : blocks(_blocks) {} + virtual void process(const Block &block) = 0; + virtual ~BlockProcessor() {} +}; + + +class DictBlockProcessor : public BlockProcessor +{ +public: + DictBlockProcessor(std::vector &_blocks) : BlockProcessor(_blocks) {} + void process(const Block &block) + { + ((const DictBlock&)block).setBlockNumbers(blocks); + } +}; + +BlockManager::BlockManager(const BlockManagerParameters *params, + bool update, BlockFactory *bfactory) + : _blockFactory(bfactory) +{ + _update = update; + // params.readState(); + _blockSize = params->getBlockSize(); + HCDBG(std::cerr << "opening " << params->getFile().native_file_string() << std::endl); + if (!update) + { + _file.open(params->getFile().native_file_string().c_str(), std::ios::in | std::ios::binary); + } + else + { + _file.open(params->getFile().native_file_string().c_str(), std::ios::in | std::ios::out | std::ios::binary); + if (!_file.is_open()) + { + HCDBG(std::cerr << "didn't exist" << std::endl); + _file.open(params->getFile().native_file_string().c_str(), + std::ios::in | std::ios::out | std::ios::trunc | std::ios::binary); + } + if (!_file.is_open()) + std::cerr << "Cannot open " << params->getFile().native_file_string(); + } + + _file.seekg(0, std::ios::end); + long length = _file.tellg(); + if (length < 0) length = 0; + _file.seekg(0, std::ios::beg); + _file.clear(); + + HCDBG(std::cerr << "len is " << length << std::endl); + + if (length <= 0 && update) + { + Block* _dummy = bfactory->makeBlock(); + _dummy->setBlockNumber(0); + writeBlock(*_dummy); + delete _dummy; + length = _blockSize; + } + + _file.seekg(0, std::ios::beg); + + int _blockTableSize = (length/_blockSize); + 
HCDBG(std::cerr << "len is now " << _blockTableSize << std::endl); + for (int i = 0; i < _blockTableSize; ++i) + mapBlock(bfactory->makeBlock()); +} + +Block& BlockManager::getNewBlock() +{ + unsigned int number = _blockTab.size(); + + Block *bl = _blockFactory->makeBlock(); + bl->setBlockNumber(number); + writeBlock(*bl); + addDescriptor(bl); + + return *(_blockTab[number]._block); +} + +void BlockManager::setModified(int blNum) +{ + _blockTab[blNum]._modf = true; +} + +void BlockManager::close() +{ + if (_update) + { + std::vector::const_iterator aEnd = _blockTab.end(); + for (std::vector::const_iterator aIter = _blockTab.begin(); + aIter != aEnd; ++aIter) + { + if (aIter->_modf) + writeBlock(*(aIter->_block)); + } + } + _file.close(); +} + +void BlockManager::processBlocks(BlockProcessor &processor) +{ + std::vector::const_iterator aEnd = _blockTab.end(); + for (std::vector::const_iterator aIter = _blockTab.begin(); + aIter != aEnd; ++aIter) + { + processor.process(*(aIter->_block)); + } +} + +void BlockManager::mapBlock(Block* block) +{ + block->readIn(_file); + addDescriptor(block); +} + +void BlockManager::addDescriptor(Block *block) +{ + BlockDescriptor desc(block); + _blockTab.push_back(desc); + HCDBG(fprintf(stderr, "numbers are %lx %lx\n", block->_number, _blockTab.size() - 1)); + if (block->_number != _blockTab.size() - 1) + { + std::cerr << "totally screwed" << std::endl; + exit(-1); + } + HCDBG(std::cerr << "addDescriptor blocks are now " << _blockTab.size() << std::endl); +} + +void BlockManager::writeBlock(const Block &bl) +{ + _file.seekp(_blockSize * bl._number); + bl.writeOut(_file); +} + +Block& BlockManager::accessBlock(int blockNumber) +{ + return *(_blockTab[blockNumber]._block); +} + +BlockManager::~BlockManager() +{ + std::vector::iterator aEnd = _blockTab.end(); + for (std::vector::iterator aIter = _blockTab.begin(); + aIter != aEnd; ++aIter) + { + delete aIter->_block; + } + delete _blockFactory; +} + +void BtreeDict::setBlocks(std::vector 
&inblocks) +{ + DictBlockProcessor foo(inblocks); + blockManager->processBlocks(foo); +} + +// can go to Full +void BtreeDict::map(const EntryProcessor &processor) +{ + accessBlock(root).doMap(*this, processor); +} + +void DictBlock::restoreKeyInBuffer(int entry, std::vector &buffer) +{ + int howMany = entryKeyLength(entry); + int where = entryCompression(entry); + int from = entryKey(entry); + while (howMany-- > 0) + buffer[where++] = _data[from++]; +} + +std::string DictBlock::restoreKey(int entry, std::vector &buffer) +{ + int howMany = entryKeyLength(entry); + int where = entryCompression(entry); + int from = entryKey(entry); + while (howMany-- > 0) + buffer[where++] = _data[from++]; + return std::string((const char*)(&buffer[0]), 0, where); +} + +std::string DictBlock::findID(int id) +{ + std::vector buffer(BtreeDict::MaxKeyLength); + int freeSpace = free(); + for (int ent = firstEntry(); ent < freeSpace; ent = nextEntry(ent)) + { + if (entryID(ent) == id) // found + return restoreKey(ent, buffer); + else + restoreKeyInBuffer(ent, buffer); + } + std::cerr << "ID not found in block" << std::endl; + exit(-1); +} + +void DictBlock::setBlockNumbers(std::vector &blocks) const +{ + for (int e = firstEntry(); e < _free; e = nextEntry(e)) + blocks[entryID(e)] = _number; +} + +void DictBlock::listBlock() +{ + std::vector buffer(BtreeDict::MaxKeyLength); + int freeSpace = free(); + int entryPtr = firstEntry(); + if (_isLeaf) + { + while (entryPtr < freeSpace) + { + std::cout << restoreKey(entryPtr, buffer) << " " << + entryID(entryPtr); + entryPtr = nextEntry(entryPtr); + } + } + else + std::cout << "not leaf" << std::endl; +} + +void DictBlock::doMap(BtreeDict &owner, const EntryProcessor &processor) +{ + std::vector buffer(BtreeDict::MaxKeyLength); + int freeSpace = free(); + int entryPtr = firstEntry(); + if (_isLeaf) + { + while (entryPtr < freeSpace) + { + processor.processEntry(restoreKey(entryPtr, buffer), + entryID(entryPtr)); + entryPtr = nextEntry(entryPtr); + 
} + } + else + { + int entryIdx = 0; + while (entryPtr < freeSpace) + { + owner.accessBlock(getChildIdx(entryIdx)).doMap(owner,processor); + processor.processEntry(restoreKey(entryPtr, buffer), + entryID(entryPtr)); + entryPtr = nextEntry(entryPtr); + ++entryIdx; + } + owner.accessBlock(getChildIdx(entryIdx)).doMap(owner, processor); + } +} + +void DictBlock::withPrefix(BtreeDict &owner, const std::string &prefix, + size_t prefLen, IntegerArray &result) +{ + std::vector buffer(BtreeDict::MaxKeyLength); + int freeSpace = free(); + int entryPtr = firstEntry(); + if (_isLeaf) + { + while (entryPtr < freeSpace) + { + if (restoreKey(entryPtr, buffer).find(prefix) == 0) + result.push_back(entryID(entryPtr)); + entryPtr = nextEntry(entryPtr); + } + } + else + { + int entryIndex = 0; + while (entryPtr < freeSpace) + { + std::string key = restoreKey(entryPtr, buffer); + if (key.size() > prefLen) + key = key.substr(0, prefLen); + int cmp = key.compare(prefix); + if (cmp < 0) + { + entryPtr = nextEntry(entryPtr); + ++entryIndex; + } + else if (cmp == 0) + { + result.push_back(entryID(entryPtr)); + owner.accessBlock(getChildIdx(entryIndex)).withPrefix(owner, prefix, prefLen, result); + entryPtr = nextEntry(entryPtr); + ++entryIndex; + } + else + { + owner.accessBlock(getChildIdx(entryIndex)).withPrefix(owner, prefix, prefLen, result); + return; + } + } + owner.accessBlock(getChildIdx(numberOfEntries())).withPrefix(owner, prefix, prefLen, result); + } +} + +int BtreeDict::ENTHEADERLEN = 6; +int BtreeDict::BLOCKSIZE = 2048; +int BtreeDict::DATALEN = BtreeDict::BLOCKSIZE - Block::HEADERLEN; +int BtreeDict::MaxKeyLength = 255; + //!!! 
Careful with that number, Eugene +int BtreeDict::lastPtrIndex = 508; + +DictBlock::DictBlock() : Block(BtreeDict::BLOCKSIZE) +{ +} + +int DictBlock::getChildIdx(int index) const +{ + return nthPointer(BtreeDict::lastPtrIndex - index); +} + +int DictBlock::entryLength(int entry) const +{ + return BtreeDict::ENTHEADERLEN + entryKeyLength(entry); +} + +int DictBlock::entryKey(int entry) const +{ + return entry + BtreeDict::ENTHEADERLEN; +} + +/* Records number at blocks[index]; when index is past the end the table is first resized to index + 1000. */ +void setBlockNumber2(std::vector &blocks, size_t index, int number) +{ + if (index >= blocks.size()) + blocks.resize(index + 1000); + blocks[index] = number; +} + +/* A key/id pair travelling through an insertion. key is sized one element longer than the copied key bytes; block starts at -1 and is assigned by FullDictBlock::split(). */ +class Entry +{ +public: + std::vector key; + int id; + int block; + + Entry(const std::vector &keyin, int length, int idin) : key(length+1), id(idin), block(-1) + { + memcpy(&key[0], &keyin[0], length); + } + + Entry(const std::string &keyin, int idin) : key(keyin.size()+1), id(idin), block(-1) + { + memcpy(&key[0], keyin.c_str(), keyin.size()); + } + + /* Compares over the shorter of the two key vectors, element-wise as unsigned values; equal over that range yields false. */ + bool smallerThan(const Entry &other) + { + for (size_t i = 0; i < std::min(key.size(), other.key.size()); i++) + if (key[i] != other.key[i]) + return (key[i]&0xFF) < (other.key[i]&0xFF); + return false; + } +}; // end of internal class Entry + +class FullDictBlock; + +/* Writable dictionary: BtreeDict plus insertion (store/insert/insertHere) and block allocation. */ +class FullBtreeDict : public BtreeDict +{ +protected: + BtreeDictParameters *_params; + bool update; +public: + /* params is taken by reference; the "&params" here had been mangled into a single non-ASCII character */ + FullBtreeDict(BtreeDictParameters &params, bool update); + void store(const std::string &bla, int id); + boost::shared_ptr insert(FullDictBlock &bl, boost::shared_ptr ent); + boost::shared_ptr insertHere(FullDictBlock &bl, boost::shared_ptr ent); + FullDictBlock& getNewBlock(); + void setModified(Block &bl); + void close(int freeID); +}; + +class FullDictBlock : public DictBlock +{ +public: + virtual void setFree(int free); + void setNumberOfEntries(int n) { setIntegerAt(0, n); } + void setChildIndex(int index, int value) + { + setIntegerAt(4*(BtreeDict::lastPtrIndex - index + 1), value); + } + void setEntryID(int i, int id) { setIntegerAt(i + 2, 
id); } + void setBlockNumbers(std::vector &blocks) const; + bool insert(const Entry &entry); + void makeEntry(int entry, const std::vector &key, int id, int length, int compr); + bool insert(const Entry &ent, int entryPtr, int compr1, int compr2, int index); + int insertInternal(const Entry &entry); + boost::shared_ptr split(FullDictBlock &newbl); + void initInternal(int leftBlock, const Entry &entry); + bool insert(boost::shared_ptr entry); + bool insert(boost::shared_ptr ent, int entryPtr, + int compr1, int compr2, int index); + +}; + +void FullDictBlock::initInternal(int leftBlock, const Entry &entry) +{ + _isLeaf = false; + setNumberOfEntries(1); + setChildIndex(0, leftBlock); + setChildIndex(1, entry.block); + int ent = firstEntry(); + makeEntry(ent, entry.key, entry.id, entry.key.size() - 1, 0); + setFree(nextEntry(ent)); +} + +void FullDictBlock::setFree(int infree) +{ + _free = infree - firstEntry(); + _data[infree] = _data[infree + 1] = 0; // sentinel +} + +boost::shared_ptr FullDictBlock::split(FullDictBlock& newbl) +{ + std::vector buffer(BtreeDict::MaxKeyLength); + int freeSpace = free(); + int half = freeSpace/2; + int index = 0; // of middle entry + newbl._isLeaf = _isLeaf; + int ent; + for (ent = firstEntry(); ent < half; ent = nextEntry(ent)) + { + restoreKeyInBuffer(ent, buffer); + ++index; + } + int entriesToMove = numberOfEntries() - index - 1; + // middle entry + restoreKeyInBuffer(ent, buffer); + int len = entryKeyLength(ent) + entryCompression(ent); + boost::shared_ptr result(new Entry(buffer, len, entryID(ent))); + result->block = newbl._number; + int newFree = ent; + // rest goes to the new block + ent = nextEntry(ent); + restoreKeyInBuffer(ent, buffer); + len = entryKeyLength(ent) + entryCompression(ent); + int nptr = firstEntry(); + newbl.makeEntry(nptr, buffer, entryID(ent), len, 0); + ent = nextEntry(ent); + memmove(&(newbl._data[newbl.nextEntry(nptr)]), &(_data[ent]), freeSpace - ent); + newbl.setNumberOfEntries(entriesToMove); + 
newbl.setFree(newbl.nextEntry(nptr) + freeSpace - ent); + if (_isLeaf == false) // need to split pointers + { + int from = 4*(BtreeDict::lastPtrIndex - numberOfEntries() + 1); + int to = from + 4*(index + 1); + memmove(&(newbl._data[to]), &(_data[from]), 4*(entriesToMove + 1)); + } + // this entry will end here + setFree(newFree); + setNumberOfEntries(index); + return result; + //!!!remember updating ID -> string association +} + +void FullDictBlock::setBlockNumbers(std::vector &blocks) const +{ + for (int e = firstEntry(); e < _free; e = nextEntry(e)) + setBlockNumber2(blocks, entryID(e), _number); +} + +bool FullDictBlock::insert(boost::shared_ptr ent, int entryPtr, + int compr1, int compr2, int index) +{ + const std::vector &key = ent->key; + int keyLen = key.size() - 1 - compr1; + int freeSpace = free(); + // calculate how much space is needed to add the new entry + // first, how many bytes are needed for just the new entry + int demand = BtreeDict::ENTHEADERLEN + keyLen; + // adding an entry can increase compression in the following entry + + int increase = 0; + if (entryPtr < freeSpace) + if (entryCompression(entryPtr) < compr2) + increase = compr2 - entryCompression(entryPtr); + /* + std::cerr << "key " << key << std::endl; + std::cerr << "entryPtr " << entryPtr << std::endl; + std::cerr << "compr1 " << compr1) << std::endl; + std::cerr << "compr2 " << compr2) << std::endl; + std::cerr << "index " << index) << std::endl; + std::cerr << "demand " << demand) << std::endl; + std::cerr << "increase " << increase) << std::endl; + */ + // check if enough space is available + int limit = _isLeaf ? BtreeDict::DATALEN-2 : 4*(BtreeDict::lastPtrIndex-numberOfEntries()-1); + + if (freeSpace + demand - increase <= limit) // 2 for sentinel + { + if (entryPtr < freeSpace) + { + // need to shift extant entries forward + int toMove = increase > 0 ? 
entryPtr + BtreeDict::ENTHEADERLEN + increase : entryPtr; + // move entries + memmove(&(_data[toMove + demand - increase]), &(_data[toMove]), freeSpace - toMove); + + if (increase > 0) + { + // update header + unsigned char tmp = static_cast(increase); + _data[entryPtr] = _data[entryPtr] - tmp; + _data[entryPtr + 1] = _data[entryPtr + 1] + tmp; + // shift header + memmove(&(_data[entryPtr + demand]), &(_data[entryPtr]), BtreeDict::ENTHEADERLEN); + } + } + // now write the new entry in the space made above + makeEntry(entryPtr, key, ent->id, keyLen, compr1); + + if (_isLeaf == false) + { + int from = 4*(BtreeDict::lastPtrIndex - numberOfEntries() + 1); + memmove(&(_data[from - 4]), &(_data[from]), 4*(numberOfEntries() - index)); + setChildIndex(index + 1, ent->block); + } + setFree(freeSpace + demand - increase); + setNumberOfEntries(numberOfEntries() + 1); + + /* + System.err.println("------------list--------------"); + byte[] buffer = new byte[MaxKeyLength]; + final int freeSpace2 = free(); + int entryPtr2 = firstEntry(); + while (entryPtr2 < freeSpace2) + { + System.err.println(entryPtr2); + System.err.println(entryKeyLength(entryPtr2)); + System.err.println(entryCompression(entryPtr2)); + System.err.println(new String(_data, + entryKey(entryPtr2), + entryKeyLength(entryPtr2))); + System.err.println(restoreKey(entryPtr2, buffer)+" "+ + entryID(entryPtr2)); + entryPtr2 = nextEntry(entryPtr2); + } + System.err.println("------------end--------------"); + */ + return true; + } + else + return false; +} + +// finds the place and context +bool FullDictBlock::insert(boost::shared_ptr entry) +{ + const std::vector &inkey = entry->key; + int inputKeyLen = inkey.size() - 1; + int freeSpace = free(); + int entryPtr = firstEntry(); + int nCharsEqual = 0; + int prevNCEqual = 0; + int compression = 0; + + for (int entryIndex = 0;;) + { + if (entryPtr == freeSpace) + return insert(entry, entryPtr, nCharsEqual, 0, numberOfEntries()); + else if (compression == nCharsEqual) + { + 
int keyLen = entryKeyLength(entryPtr); + int keyPtr = entryKey(entryPtr), i; + prevNCEqual = nCharsEqual; + for (i = 0; i < keyLen && inkey[nCharsEqual] == _data[keyPtr + i]; i++) + ++nCharsEqual; + if (i == keyLen) + { + if (nCharsEqual == inputKeyLen) + { + HCDBG(std::cerr << "setting to " << entry->id << std::endl); + setEntryID(entryPtr, entry->id); + return true; + } + } + else if ((inkey[nCharsEqual]&0xFF) < (_data[keyPtr + i]&0xFF)) + return insert(entry, entryPtr, prevNCEqual, nCharsEqual, entryIndex); + } + else if (compression < nCharsEqual) // compression dropped + { + int index = entryPtr == freeSpace ? numberOfEntries() : entryIndex; + return insert(entry, entryPtr, nCharsEqual, compression, index); + } + do + { + entryPtr = nextEntry(entryPtr); + ++entryIndex; + } + while (entryCompression(entryPtr) > nCharsEqual); + compression = entryCompression(entryPtr); + } +} + +static int fulldictcount; + +/* Block factory handing out FullDictBlock instances; counts allocations in fulldictcount. */ +class FullDictBlockFactory : public BlockFactory +{ +public: + Block* makeBlock() const + { + fulldictcount++; + return new FullDictBlock; + } +}; + +/* Block visitor that lets each FullDictBlock record its block number for the entry IDs it holds. */ +class FullDictBlockProcessor : public BlockProcessor +{ +public: + FullDictBlockProcessor(std::vector &_blocks) : BlockProcessor(_blocks) {} + void process(const Block &block) + { + ((const FullDictBlock&)block).setBlockNumbers(blocks); + } +}; + +/* Initialises the dictionary from params, sizes blocks to params.getFreeID() and runs a FullDictBlockProcessor over all blocks. The parameter is a reference whose address is kept in _params; "&params" had been mangled into a single non-ASCII character in both places. */ +FullBtreeDict::FullBtreeDict(BtreeDictParameters &params, bool _update) : + _params(&params), update(_update) +{ + init(_params, update, new FullDictBlockFactory()); + HCDBG(std::cerr << "id is " << params.getFreeID() << std::endl); + blocks.resize(params.getFreeID()); + + FullDictBlockProcessor foo(blocks); + blockManager->processBlocks(foo); + /* + if (logging) + log = new FileWriter("/tmp/FullBtreeDict.log"); + */ +} + +void FullBtreeDict::setModified(Block &bl) +{ + blockManager->setModified(bl._number); +} + +/* Obtains a fresh block from the block manager and marks it modified straight away. */ +FullDictBlock& FullBtreeDict::getNewBlock() +{ + FullDictBlock &nbl = (FullDictBlock&)blockManager->getNewBlock(); + setModified(nbl); + return nbl; +} + 
+boost::shared_ptr FullBtreeDict::insertHere(FullDictBlock &bl, boost::shared_ptr ent) +{ + setModified(bl); // to be modified in any case + if (bl.insert(ent)) + return boost::shared_ptr(); + else + { + FullDictBlock &nbl = getNewBlock(); + boost::shared_ptr middle = bl.split(nbl); + nbl.setBlockNumbers(blocks); + if ((middle->smallerThan(*ent) ? nbl : bl).insert(ent) == false) + { + std::cerr << "entry didn't fit into a freshly split block" << std::endl; + exit(-1); + } + return middle; + } +} + +void FullDictBlock::makeEntry(int entry, const std::vector &key, int id, int length, int compr) +{ + _data[entry] = static_cast(length); + _data[entry + 1] = static_cast(compr); + setEntryID(entry, id); + memmove(&(_data[entryKey(entry)]), &(key[compr]), length); +} + +int FullDictBlock::insertInternal(const Entry &entry) +{ + const std::vector &inkey = entry.key; + int inputKeyLen = inkey.size() - 1; + int entryPtr = firstEntry(); + int freeSpace = free(); + int nCharsEqual = 0; + int compression = 0; + + for (int entryIndex = 0;;) + { + if (entryPtr == freeSpace) + return numberOfEntries(); + else if (compression == nCharsEqual) + { + int i; + int keyLen = entryKeyLength(entryPtr); + int keyPtr = entryKey(entryPtr); + for (i = 0; i < keyLen && inkey[nCharsEqual] == _data[keyPtr + i]; i++) + ++nCharsEqual; + if (i == keyLen) + { + if (nCharsEqual == inputKeyLen) + { + setEntryID(entryPtr, entry.id); + return -1; + } + } + else if ((inkey[nCharsEqual]&0xFF) < (_data[keyPtr + i]&0xFF)) + return entryIndex; + } + else if (compression < nCharsEqual) // compression dropped + return entryPtr >= freeSpace ? 
numberOfEntries() : entryIndex; + + do + { + entryPtr = nextEntry(entryPtr); + ++entryIndex; + } + while (entryCompression(entryPtr) > nCharsEqual); + compression = entryCompression(entryPtr); + } +} + +/* + delegation to powerful primitives at the FullDictBlock level lets us + express the insertion algorithm very succinctly here +*/ +boost::shared_ptr FullBtreeDict::insert(FullDictBlock &bl, boost::shared_ptr ent) +{ + if (bl._isLeaf) + ent = insertHere(bl, ent); + else + { + int index = bl.insertInternal(*ent); + if (index != -1) + { + ent = insert((FullDictBlock&)child(bl, index), ent); + if (ent.get()) + ent = insertHere(bl, ent); + } + } + return ent; +} + +/* Stores the token bla under id. Tokens of 250 bytes or more are cut to their leading 249 bytes before insertion. If inserting into the root hands back an entry, a new root block is created and recorded in the parameters. */ +void FullBtreeDict::store(const std::string &bla, int id) +{ + HCDBG(std::cerr << "storing " << bla << " id " << id << std::endl); + /* + if (logging) + log.write(key + " " + id + "\n"); + */ + std::string key = bla; + /* The former loop used bla.substr(--length), which selects the tail starting at that position and so reduced an over-long token to its last byte; keep the head instead. NOTE(review): the cut is byte-wise - confirm that splitting a multi-byte character is acceptable here. */ + if (key.size() >= 250) + key = bla.substr(0, 249); + + boost::shared_ptr aTemp(new Entry(key, id)); + FullDictBlock &rBlock = (FullDictBlock&)accessBlock(root); + boost::shared_ptr entry = insert(rBlock, aTemp); + if (entry.get()) + { + // new root; writing to params needed + FullDictBlock &nbl = getNewBlock(); + nbl.initInternal(root, *entry); + setBlockNumber2(blocks, entry->id, root = nbl._number); + _params->setRoot(root); + } +} + +/* Records freeID in the parameters, updates the schema when opened for update, then closes the base dictionary. */ +void FullBtreeDict::close(int freeID) +{ + _params->setFreeID(freeID); + if (update) + _params->updateSchema(); + BtreeDict::close(); + /* + if (logging) + log.close(); + */ +} + +class ConceptLocation +{ +public: + int _concept; + int _begin; + int _end; +public: + ConceptLocation(int conceptID, int begin, int end); + static void sortByConcept(std::vector &array, int i1, int i2); + static void sortByPosition(std::vector &array, int i1, int i2); + int getConcept() const { return _concept; } + void setConcept(int concept) { 
_concept = concept; } + int getBegin() const { return _begin; } + int getEnd() const { return _end; } + int getLength() const { return _end - _begin; } + bool equals(const ConceptLocation &other) const + { + return _concept==other._concept&&_begin==other._begin&&_end==other._end; + } +}; + +class DocumentCompressor; + +class Index : public IndexAccessor +{ +protected: + typedef std::hash_map IndexHashtable; + bool _update; + IndexHashtable _cache; + Schema *_schema; +private: + BtreeDictParameters *_dictParams; + FullBtreeDict *_dict; + int _freeID; + std::fstream *_positionsFile; + std::fstream *_offsetsFile; + DocumentCompressor *_documentCompressor; + IntegerArray _concepts; + IntegerArray _offsets; + std::vector _allLists; // POSITIONS + void readDocumentsTable(const std::string &fileName); + void readOffsetsTables(const std::string &fileName); + void readPositions(); +protected: + IntegerArray _microIndexOffsets; + IntegerArray _documents; + IntegerArray _titles; + std::vector _positions; +private: + int _positionsCacheSize; + int _currentBatchOffset; + bool _allInCache; +protected: + virtual void writeOutOffsets(); +public: + Index(const fs::path &indexName, bool update); + virtual ~Index(); + void init(); + int intern(const std::string &name); + std::fstream& getPositionsFile(); + std::fstream& getOffsetsFile(); + DocumentCompressor& getDocumentCompressor(); + virtual void compress(int docID, int titleID, + std::vector &locations, + std::vector &extents); + void close(); +}; + +Index::Index(const fs::path &indexName, bool update) : IndexAccessor(indexName), + _update(update), _cache(256), _schema(NULL), _dictParams(NULL), _dict(NULL), _positionsFile(0), _offsetsFile(0), _documentCompressor(0), + _positionsCacheSize(0), _currentBatchOffset(0), _allInCache(false) +{ +} + +class CompressorIterator; +class Decompressor +{ +private: + static int BitsInByte; + static int NBits; + + int _readByte; + int _toRead; + int _path; + +protected: + virtual int 
getNextByte() = 0; + virtual void initReading() { _toRead = 0; _path = 0; } + +private: + int countZeroes(); + // reads 1 bit; returns non-0 for bit "1" + int read(); + +public: + int read(int kBits); + void beginIteration() { _path = 0; } + bool readNext(int k, CompressorIterator &it); + void decode(int k, IntegerArray &array); + void ascDecode(int k, IntegerArray &array); + int ascendingDecode(int k, int start, std::vector &array); + virtual ~Decompressor() {} +}; + +int Decompressor::BitsInByte = 8; +int Decompressor::NBits = 32; + +class ByteArrayDecompressor : public Decompressor +{ +private: + const std::vector *_array; + int _index; + int _index0; +public: + ByteArrayDecompressor(const std::vector *array, int index) { initReading(array, index); } + using Decompressor::initReading; + virtual void initReading(const std::vector *array, int index) + { + _array = array; + _index = _index0 = index; + Decompressor::initReading(); + } + int bytesRead() { return _index - _index0; } +protected: + int getNextByte() + { + int ret = (*_array)[_index] & 0xFF; + HCDBG(fprintf(stderr, "ByteArrayDecompressor::getNextByte of %d at index %d\n", ret, _index)); + _index++; + return ret; + } +}; + +class IndexInverter; + +class MicroIndex +{ +public: + static int RANGE; + static int NConcepts; +private: + int _currentRange; + int _documentNumber; + std::vector _concepts; + short _group; + short _ix; + IntegerArray _kTable; + IntegerArray _offsets; + IntegerArray _maxConcepts; + const std::vector *_data; + int _base; + int _limit; + int _nc; + ByteArrayDecompressor _decmp; +public: + MicroIndex(int documentNumber, const std::vector *positions, int index); + bool smallerThan(const MicroIndex &other) + { + return _currentRange < other._currentRange || + _currentRange == other._currentRange && + _documentNumber < other._documentNumber; + } + +private: + bool next() + { + if (_group <= _limit) + { + int shift, index; + if (_group > 0) + { + index = _base + _offsets[_group - 1]; + 
shift = _maxConcepts[_group - 1]; + } + else + { + index = _base; + shift = 0; + } + HCDBG(fprintf(stderr, "_data len is %lu\n", _data->size())); + + _decmp.initReading(_data, index); + _nc = _decmp.ascendingDecode(_kTable[_group*2], shift, _concepts); + HCDBG(std::cerr << "nc b set to " << _nc << std::endl); + if (_group < _limit) + { + HCDBG(fprintf(stderr, "maxconcept size is %lu group is %d\n", _maxConcepts.size(), _group)); + HCDBG(fprintf(stderr, "microindex concept index %d set to %d\n", _nc, _maxConcepts[_group])); + _concepts[_nc++] = _maxConcepts[_group]; + } + _currentRange = _concepts[_ix = 0]/RANGE; + _group++; + return true; + } + else + return false; + } + + void openDocumentIndex() + { + unsigned int kk = (*_data)[_base] & 0xFF; + HCDBG(std::cerr << "openDocumentIndex, kk is " << kk + << " base is " << _base << std::endl); + switch (kk >> 6) // get type + { + case 0: // single group, no extents + _decmp.initReading(_data, _base += 2); + _nc = _decmp.ascendingDecode(kk & 0x3F, 0, _concepts); + HCDBG(std::cerr << "nc a set to " << _nc << std::endl); + _currentRange = _concepts[_ix = 0]/RANGE; + _limit = 0; + _group = 1; + break; + case 2: // multi group, no extents + { + _decmp.initReading(_data, _base + 1); + _decmp.decode(kk & 0x3F, _kTable); + int last = _kTable.back(); + _kTable.pop_back(); + _decmp.ascDecode(last, _offsets); + last = _kTable.back(); + _kTable.pop_back(); + _decmp.ascDecode(last, _maxConcepts); + _base += 1 + _decmp.bytesRead(); + _limit = _maxConcepts.size(); + _group = 0; + next(); + } + break; + case 1: // single group, extents + case 3: // multi group, extents + std::cerr << "extents not yet implemented" << std::endl; + break; + } + } + +public: + bool process(IndexInverter &lists); +}; + +int MicroIndex::RANGE = 1024; +int MicroIndex::NConcepts = 16; + +class BitBuffer +{ +private: + static int InitSize; + static int NBits; + static int BitsInByte; + static int BytesInInt; + + int _avail; + unsigned int _word; + int _free; + 
int _size; + std::vector _array; + +public: + BitBuffer() : _avail(NBits), _word(0), _free(0), _size(InitSize) + { + _array.resize(InitSize); + } + + /* Flushes a partially filled word, left-aligned, into the array; with no pending bits it only sets _avail to 0. */ + void close() + { + if (_avail < NBits) + store(_word << _avail); + else + _avail = 0; + } + + /* Writes all stored words but the last as whole ints, then only the used bytes of the last word. */ + void write(std::fstream &out) const + { + /* no stored words: nothing to emit, and _array[_free - 1] below would index before the array */ + if (_free == 0) + return; + for (int i = 0; i < _free - 1; i++) + writeInt(out, _array[i]); + unsigned int word = _array[_free - 1]; + int bytes = BytesInInt - _avail/BitsInByte; + int shift = NBits; + while (bytes-- > 0) + writeByte(out, static_cast((word >> (shift -= BitsInByte)) & 0xFF)); + } + + void clear() + { + _word = 0; + _avail = NBits; + _free = 0; + } + + int byteCount() { return _free*BytesInInt - _avail/BitsInByte; } + int bitCount() { return _free*NBits - _avail; } + + /* Makes this buffer a copy of rhs. */ + void setFrom(const BitBuffer &rhs) + { + _word = rhs._word; + _avail = rhs._avail; + _free = rhs._free; + _array = rhs._array; + /* store() grows the array only when _free reaches _size, so _size must track the real length of the copied array; leaving the old, possibly larger, value let store() write past the end */ + _size = static_cast<int>(_array.size()); + } +private: + void growArray(int newSize) + { + _array.resize(newSize); + _size = newSize; + } + + /* Appends one full word, doubling the array when it is full. */ + void store(unsigned int value) + { + if (_free == _size) + growArray(_size * 2); + HCDBG(fprintf(stderr, "store of %x to %d\n", (int)value, _free)); + _array[_free++] = value; + } + +public: + /* Appends a single bit to the pending word, storing the word once it is full. */ + void append(int bit) + { + _word = (_word << 1) | bit; + if (--_avail == 0) + { + store(_word); + _word = 0; + _avail = NBits; + } + } + + /* Appends the low kBits bits of source, splitting them across two words when they do not fit in the pending one. */ + void append(unsigned int source, int kBits) + { + if (kBits < _avail) + { + _word = (_word << kBits) | source; + _avail -= kBits; + } + else if (kBits > _avail) + { + int leftover = kBits - _avail; + store((_word << _avail) | (source >> leftover)); + _word = source; + _avail = NBits - leftover; + } + else + { + store((_word << kBits) | source); + _word = 0; + _avail = NBits; + } + } + + void concatenate(const BitBuffer &bb) + { + if (_size - _free < bb._free) + growArray(_free + bb._free + 1); + + if (_avail == 0) + { + memmove(&_array[_free], &bb._array[0], bb._free * sizeof(unsigned int)); + _avail = bb._avail; + _free += bb._free; 
+ HCDBG(fprintf(stderr, "free bumped to %d\n", _free)); + } + else + { + int tp = _free - 1; // target + int sp = 0; // source + do + { + _array[tp] |= bb._array[sp] >> (NBits - _avail); + _array[++tp] = bb._array[sp++] << _avail; + } + while (sp < bb._free); + _free += bb._free; + if ((_avail += bb._avail) >= NBits) + { + _avail -= NBits; + _free--; + } + HCDBG(fprintf(stderr, "other free bumped to %d\n", _free)); + } + } +}; + +/* Thin wrapper that owns a BitBuffer and forwards write/byteCount/clear/concatenate to it. */ +class Compressor +{ +private: + static int NBits; + static int BeginK; + BitBuffer _buffer; +public: + void write(std::fstream &out) const { _buffer.write(out); } + int byteCount() { return _buffer.byteCount(); } + void clear() { _buffer.clear(); } + void concatenate(const Compressor &other) { _buffer.concatenate(other._buffer); } + void encode(const IntegerArray &pos, int k); + void encode(const IntegerArray &pos, const IntegerArray &len, int k, int k2); + // k: starting value for minimization + int minimize(const IntegerArray &array, int startK); + int compressAscending(const IntegerArray &array); +}; + +/* Fills out with in[0] followed by the successive differences in[i] - in[i-1]. */ +void toDifferences(const IntegerArray &in, IntegerArray &out) +{ + /* size the output exactly: the former grow-only resize left stale trailing values in place whenever out was already longer than in */ + out.resize(in.size()); + if (in.empty()) + return; + out[0] = in[0]; + for (size_t i = 1; i < in.size(); ++i) + out[i] = in[i] - in[i - 1]; +} + +class IndexInverter +{ +private: + static int K; + std::vector _arrays; + int _minConcept; + int _limit; + IntegerArray _concepts; + IntegerArray _offsets; + Compressor _compr; + IntegerArray _diffs; + std::fstream *_mainFile; + // heap + int _heapSize; + std::vector _heap; + + Index &_index; + +public: + IndexInverter(Index &index) : _arrays(MicroIndex::RANGE), + _minConcept(0), _limit(MicroIndex::RANGE), + _mainFile(0), _heapSize(0), _index(index) {} + /* Releases the output stream and the first _heapSize heap entries. */ + ~IndexInverter() + { + delete _mainFile; + for (int i = 0; i < _heapSize; i++) + { + HCDBG(fprintf(stderr, "deleting number %d\n", i)); + delete _heap[i]; + } + } + /* The second parameter is a const reference named microIndexOffsets, which is how the body refers to it; "&micro" had been mangled into a single non-ASCII character. */ + void invertIndex(int nDocuments, const IntegerArray &microIndexOffsets) + { + _mainFile = 
_index.getOutputStream("DOCS"); + for (int i = 0; i < MicroIndex::RANGE; i++) + _arrays[i] = IntegerArray(); + + // read in the whole POSITIONS file + std::vector positions = _index.readByteArray("POSITIONS"); + // build heap + _heap.clear(); + _heap.resize(_heapSize = nDocuments); + for (int i = 0; i < nDocuments; i++) + _heap[i] = new MicroIndex(i, &positions, microIndexOffsets[i]); + for (int i = _heapSize/2; i >= 0; i--) + heapify(i); + // process till exhausted + while (!_heap.empty()) + if (_heap[0]->process(*this)) + heapify(0); + else if (_heapSize > 1) + { + delete _heap[0]; + _heap[0] = _heap[--_heapSize]; + heapify(0); + } + else + break; + // closing + flush(); + _mainFile->close(); + // compress index file + std::fstream *indexFile = _index.getOutputStream("DOCS.TAB"); + unsigned char byte = static_cast( + _compr.compressAscending(_concepts)); + *indexFile << byte; // write k + _compr.write(*indexFile); + _compr.clear(); + byte = static_cast(_compr.minimize(_offsets, K)); + *indexFile << byte; // write k + _compr.write(*indexFile); + indexFile->close(); + delete indexFile; + } + + short process(int documentNumber, std::vector &concepts, + int n, short start, bool firstTime) + { + if (firstTime && concepts[start] >= _limit) + flush(); + concepts[n] = _limit; // sentinel + HCDBG(fprintf(stderr, "size is %lu, n index is %d, and limit is %d, start is %d\n", + concepts.size(), n, _limit, start)); + while (concepts[start] < _limit) + { + HCDBG(fprintf(stderr, "array size is %lu, %d %d\n", _arrays.size(), concepts[start], _minConcept)); + _arrays[concepts[start++] - _minConcept].push_back(documentNumber); + } + return start; + } + +private: + void heapify(int i) + { + int r = (i + 1) << 1, l = r - 1; + int smallest = l < _heapSize && _heap[l]->smallerThan(*_heap[i]) ? 
l : i; + if (r < _heapSize && _heap[r]->smallerThan(*_heap[smallest])) + smallest = r; + if (smallest != i) + { + MicroIndex *temp = _heap[smallest]; + _heap[smallest] = _heap[i]; + _heap[i] = temp; + heapify(smallest); + } + } + + /* Writes every non-empty per-concept document list of the current range to the main file (k byte, then the compressed differences), records its concept and length, and advances the window by MicroIndex::RANGE. */ + void flush() + { + for (int i = 0; i < MicroIndex::RANGE; ++i) + { + if (!_arrays[i].empty()) + { + toDifferences(_arrays[i], _diffs); + unsigned char byte = static_cast( + _compr.minimize(_diffs, K)); + *_mainFile << byte; // write k + _offsets.push_back(_compr.byteCount() + 1); + _compr.write(*_mainFile); + _concepts.push_back(_minConcept + i); + _arrays[i].clear(); + _diffs.clear(); + _compr.clear(); + } + } + _limit += MicroIndex::RANGE; + _minConcept += MicroIndex::RANGE; + } +}; + +int IndexInverter::K = 3; + +/* Binds the micro index of one document to the shared positions data and decodes its first group. Every scalar member gets a defined value first: openDocumentIndex() leaves them untouched on its "extents not yet implemented" branch, while smallerThan(), process() and next() read them afterwards. With _group(1) > _limit(0) and _nc(0), such a document simply reports itself as exhausted. Initialisers follow member declaration order. */ +MicroIndex::MicroIndex(int documentNumber, const std::vector *positions, int index) + : _currentRange(0), _documentNumber(documentNumber), _concepts(NConcepts + 1), + _group(1), _ix(0), _data(positions), _base(index), _limit(0), _nc(0), + _decmp(NULL, 0) +{ + openDocumentIndex(); +} + +/* Feeds the current concepts to the inverter; returns true while concepts beyond the inverter's current range remain, false once all groups are consumed. */ +bool MicroIndex::process(IndexInverter &lists) +{ + bool firstTime = true; + while (true) + { + short stop = lists.process(_documentNumber, _concepts, _nc, _ix, firstTime); + if (stop < _nc) + { + _currentRange = _concepts[_ix = stop]/RANGE; + return true; + } + else if (next()) + firstTime = false; + else + return false; + } +} + +void Index::close() +{ + /* + BtreeDictCompactor source = new BtreeDictCompactor(_dictParams, false); + + URL url = new URL("file", "", _indexDir + "compacted"); + BtreeDictParameters params = + new BtreeDictParameters(url, _dictParams.getBlockSize(), 0, _freeID); + source.compact(params); + URL tmapURL = new URL("file", "", _indexDir + "DICTIONARY"); + File tmap = new File(tmapURL.getFile()); + File compacted = new File(url.getFile()); + compacted.renameTo(tmap); + _dictParams.setRoot(params.getRootPosition()); + _dictParams.updateSchema(); + */ + _dict->close(_freeID); + if (_positionsFile) + { + delete _positionsFile; + _positionsFile = NULL; + } + + if (_update) + { 
+ writeOutOffsets(); + _dictParams->setFreeID(_freeID); + _dictParams->updateSchema(); + _schema->save(); + IndexInverter inverter(*this); + inverter.invertIndex(_documents.size(), _microIndexOffsets); + } + if (_offsetsFile) + { + delete _offsetsFile; + _offsetsFile = NULL; + } +} + +void Index::init() +{ + bool indexExists = false; + if (_update) + { + createIfNeeded(); + _cache.clear(); + } + if (_schema) delete _schema; + _schema = new Schema(*this, _update); + + if (_dictParams) delete _dictParams; + _dictParams = new BtreeDictParameters(*_schema, "DICTIONARY"); + + if (_dictParams->readState() == false) + { + _dictParams->setBlockSize(2048); + _dictParams->setRoot(0); + _dictParams->setFreeID(1); + } + else + indexExists = true; + + if (_dict) delete _dict; + _dict = new FullBtreeDict(*_dictParams, _update); + + _freeID = _dictParams->getFreeID(); + + _documents.clear(); + if (indexExists) + { + // read in index parts + _allLists = readByteArray("DOCS"); + readDocumentsTable("DOCS.TAB"); + readOffsetsTables("OFFSETS"); + readPositions(); + } + else + { + _microIndexOffsets.clear(); + _titles.clear(); + } +} + +int Index::intern(const std::string &name) +{ + IndexHashtable::const_iterator aIter = _cache.find(name); + if (aIter != _cache.end()) + return aIter->second; + else + { + //Seeing as we always start off with an empty dictionary, + //our entries will always be in the _cache, so don't ever + //search the underlying dictionary +#if 0 + int id = _dict->fetch(name); + if (id == 0) + _dict->store(name, id = _freeID++); +#else + int id = _freeID++; + _dict->store(name, id); +#endif + _cache.insert(IndexHashtable::value_type(name, id)).first->second = id; + return id; + } +} + +std::fstream& Index::getPositionsFile() +{ + if (!_positionsFile) + _positionsFile = getRAF("POSITIONS", _update); + return *_positionsFile; +} + +std::fstream& Index::getOffsetsFile() +{ + if (!_offsetsFile) + _offsetsFile = getRAF("OFFSETS", _update); + return *_offsetsFile; +} + 
+class VectorBtreeParameters : public BlockManagerParameters +{ +private: + int _vectorLength; +public: + VectorBtreeParameters(Schema &schema, const std::string &partName) : + BlockManagerParameters(schema, partName) + { + _vectorLength = integerParameter("vl"); + } + + void updateSchema() + { + std::ostringstream tmp; + tmp << "vl=" << _vectorLength; + BlockManagerParameters::updateSchema(tmp.str()); + } + + VectorBtreeParameters(Schema &schema, const std::string &partName, int vecLen) + : BlockManagerParameters(schema, partName) + { + _vectorLength = vecLen; + } + + int getVectorLength() { return _vectorLength; } +}; + +enum outerbreak { dobreak, docontinue, donothing }; + +class VectorProcessor +{ + std::vector _vector; +public: + virtual bool processVector() = 0; + std::vector& getVectorBuffer() { return _vector; } + virtual ~VectorProcessor() {} +}; + +class VectorBlock; + +class VectorBtree +{ +protected: + VectorBlock *_root; + BlockManager *_blockManager; + VectorBtreeParameters *_params; + int _blockSize; +public: + int _maxEntries; + int _leafDataLimit; +protected: + int _vectorsOffset; + VectorBlock& accessBlock(int index); + VectorBtree() {/*empty*/} +public: + int _vecLen; + int vector(int index) const; + static int memcmp(const std::vector &v1, + const std::vector &v2, int i2, int n); + VectorBtree(VectorBtreeParameters *params); + ~VectorBtree() { delete _blockManager; } +}; + +class VectorBlockFactory : public BlockFactory +{ +private: + int _blockSize; +public: + VectorBlockFactory(int blockSize) : _blockSize(blockSize) {} + Block* makeBlock() const; +}; + +VectorBtree::VectorBtree(VectorBtreeParameters *params) +{ + _params = params; + _vecLen = params->getVectorLength(); + _blockSize = params->getBlockSize(); + _maxEntries=(_blockSize-Block::HEADERLEN-Block::IDLEN)/(_vecLen+Block::IDLEN); + if ((_maxEntries & 1) == 0) // needs to be odd + _maxEntries--; + + _leafDataLimit = _blockSize - _vecLen - Block::HEADERLEN - Block::IDLEN; + + 
_vectorsOffset = (_maxEntries + 1)*Block::IDLEN; + _blockManager = new BlockManager(_params, false, new VectorBlockFactory(_blockSize)); + _root = &(accessBlock(params->getRootPosition())); +} + +VectorBlock& VectorBtree::accessBlock(int index) +{ + return (VectorBlock&)_blockManager->accessBlock(index); +} + +int VectorBtree::memcmp(const std::vector &v1, + const std::vector &v2, int i2, int n) +{ + for (int i = 0; i < n; i++, i2++) + if (v1[i] != v2[i2]) + return (v1[i]&0xFF) - (v2[i2]&0xFF); + return 0; +} + +class VectorBlock : public Block +{ +public: + VectorBlock(int size) : Block(size) {} +protected: + int findIndex(const std::vector &key, const VectorBtree &tree) + { + int i = 0, j = _free - 1; + while (i <= j) + { + int k = (i + j)/2; + int test = VectorBtree::memcmp(key, _data, tree.vector(k),tree._vecLen); + // std::cerr << "k = " << k << ", test = " << test << std::endl; + if (test > 0) + i = k + 1; + else if (test < 0) + j = k - 1; + else + return -1 - k; // result always negative; "k" encoded + } + return i; + } +private: + int FindVectorsInLeaf(const std::vector &lo, + const std::vector &hi, int commLen, int prefLen, + std::vector &buffer, int size, const VectorBtree &tree) + { + int idx = 0, start; + for (int nBytesEq = 0;;) + { + // std::cout << "idx = " << idx << std::endl; + if (_data[idx] == nBytesEq) // at compression byte + { + int i; + outerbreak hack(donothing); + for (i = nBytesEq; i < tree._vecLen; i++) + { + if (lo[i] == _data[++idx]) + ++nBytesEq; + else if ((lo[i]&0xFF) < (_data[idx]&0xFF)) + if (nBytesEq >= commLen && (i >= prefLen || (hi[i]&0xFF) >= (_data[idx]&0xFF))) + { + start = nBytesEq; + hack = dobreak; + break; + } + else + return 0; + else + { + idx += tree._vecLen - i; // skip + hack = docontinue; + break; + } + } + + if (hack == dobreak) + break; + else if (hack == docontinue) + continue; + + if (i == tree._vecLen) // eq vec found + if ((_data[++idx]&0xFF) >= prefLen) + { + start = _data[idx++]&0xFF; + break; + } + else + 
return 0; + } + else if (_data[idx] < nBytesEq) // drop + { + std::cout << idx << std::endl; + nBytesEq = (_data[idx++]); + std::cout << nBytesEq << std::endl; + if (nBytesEq < commLen) + return 0; + else if (lo[nBytesEq] < (_data[idx]&0xFF)) + if (hi[nBytesEq] < (_data[idx]&0xFF)) + return 0; + else + { + start = nBytesEq; // found + break; + } + else + idx += tree._vecLen - nBytesEq; + } + else if ((_data[idx]&0xFF) == 0xFF) + return 0; + else // compression is bigger + idx += tree._vecLen + 1 - _data[idx]; + } + + int length = std::min(size - start, _free - idx); + buffer[0] = static_cast(start); + memcpy(&(buffer[1]), &(_data[idx]), length); + buffer[length + 1] = 0; + return length + 1; + } +protected: + bool searchLeafBlock(const std::vector &key, const VectorBtree &tree) + { +#if 0 + processLeafBlock(_printer); +#endif + int nBytesEq = 0; + for (int idx = 0;; idx += tree._vecLen + 1 - _data[idx]) + { + if (_data[idx] == nBytesEq) + { + int i, j; + outerbreak hack(donothing); + for (i = _data[idx], j = idx + 1; i < tree._vecLen; i++, j++) + { + if (key[i] == _data[j]) + ++nBytesEq; + else if ((key[i]&0xFF) < (_data[j]&0xFF)) + return false; + else /* key[i] > _data[j] */ + { + hack = dobreak; + break; + } + } + + if (hack == dobreak) + break; + + if (i == tree._vecLen) /* or nBytesEq == _vecLen */ + return true; /* equal vector found */ + } + else if (_data[idx] < nBytesEq) + return false; + } + return false; + } +public: + bool processLeafBlock(VectorProcessor &processor, const VectorBtree &tree) + { + std::vector &buffer = processor.getVectorBuffer(); + for (int ix = 0; ix < _free; ix += tree._vecLen - _data[ix] + 1) + { + // cmc: the below line was a comment in the original java, somewhere along + // the line I suspect this was written in c++, then into java + // and now I'm putting it back to c++ :-( + // ::memcpy(&buffer[_data[ix]], &_data[ix + 1], _vecLen - _data[ix]); + memcpy(&(buffer[_data[ix]]), &(_data[ix + 1]), tree._vecLen - _data[ix]); + if 
(processor.processVector()) + return true; + } + return false; + } +}; // VectorBlock + +Block* VectorBlockFactory::makeBlock() const +{ + return new VectorBlock(_blockSize); +} + +class FullVectorBlock : public VectorBlock +{ +public: + FullVectorBlock(int size) : VectorBlock(size) {} + bool isFull(const VectorBtree &tree) const + { + //return pbl->_leaf ? pbl->_free > _leafDataLimit : pbl->_free == _maxEntries; + return _isLeaf ? _free > tree._leafDataLimit : _free == tree._maxEntries; + } +}; + +class FullVectorBtree : public VectorBtree +{ +private: + static int MaxVeclen; + static double SplitRatio; +public: + FullVectorBtree(VectorBtreeParameters* params, bool update); + bool insertVector(const std::vector &key); +private: + bool treeInsertNonfull(const FullVectorBlock &bl, const std::vector &key); + bool treeInsertNonfullRoot(const std::vector &key); + FullVectorBlock& getNewBlock(); + void enableModif(const Block &bl); + void declareModif(const Block &bl); +public: + void close() { _blockManager->close(); } +}; + +int FullVectorBtree::MaxVeclen = 128; +double FullVectorBtree::SplitRatio = 0.5; + +class FullVectorBlockFactory : public BlockFactory +{ +private: + int _blockSize; +public: + FullVectorBlockFactory(int blockSize) : _blockSize(blockSize) {} + Block* makeBlock() const + { + return new FullVectorBlock(_blockSize); + } +}; + +FullVectorBtree::FullVectorBtree(VectorBtreeParameters *params, bool update) +{ + _params = params; + _vecLen = params->getVectorLength(); + _blockSize = params->getBlockSize(); + _blockManager = new BlockManager(params, update, new FullVectorBlockFactory(_blockSize)); + _maxEntries=(_blockSize-Block::HEADERLEN-Block::IDLEN)/(_vecLen+Block::IDLEN); + // System.out.println("_maxEntries = " + _maxEntries); + if ((_maxEntries & 1) == 0) // needs to be odd + _maxEntries--; + _leafDataLimit = _blockSize - _vecLen - Block::HEADERLEN - Block::IDLEN; + _vectorsOffset = (_maxEntries + 1)*Block::IDLEN; + _root = 
&(accessBlock(params->getRootPosition())); +} + +class CompressorIterator +{ +public: + virtual void value(int value) = 0; + virtual ~CompressorIterator() {} +}; + +int Decompressor::countZeroes() +{ + for (int count = 0;; _readByte = getNextByte(), _toRead = BitsInByte) + { + HCDBG(fprintf(stderr, "count is %d\n", count)); + HCDBG(fprintf(stderr, "Decompressor::countZeroes is %x\n", _readByte)); + HCDBG(fprintf(stderr, "_toRead is %d\n", _toRead)); + HCDBG(fprintf(stderr, "_readByte is %x\n", _readByte)); + while (_toRead-- > 0) + { + if ((_readByte & (1 << _toRead)) != 0) + { + HCDBG(fprintf(stderr, "returning count of %d\n", count)); + return count; + } + else + { + ++count; + HCDBG(fprintf(stderr, "int count to %d\n", count)); + } + } + } + //return 0; +} + +// reads 1 bit; returns non-0 for bit "1" +int Decompressor::read() +{ + if (_toRead-- > 0) + return _readByte & (1 << _toRead); + else + { // get next word + _toRead = BitsInByte - 1; + return (_readByte = getNextByte()) & 0x80; + } +} + +int Decompressor::read(int kBits) +{ + int shift = BitsInByte - _toRead; + if (kBits <= _toRead) + { + HCDBG(fprintf(stderr, "leg 1\n")); + return ((_readByte<> (shift + (_toRead-=kBits)); + } + else + { + HCDBG(fprintf(stderr, "leg 2 _readByte is %d, shift %d\n", _readByte, shift)); + int result = _toRead > 0 ? 
((_readByte << shift) & 0xFF) >> shift : 0; + HCDBG(fprintf(stderr, "result is %d\n", result)); + for (kBits -= _toRead; kBits >= BitsInByte; kBits -= BitsInByte) + { + int foo = getNextByte(); + HCDBG(fprintf(stderr, "byte is %d\n", foo)); + result = (result << BitsInByte) | foo; + HCDBG(fprintf(stderr, "and result is %d\n", result)); + } + if (kBits > 0) + { + int foo = getNextByte(); + HCDBG(fprintf(stderr, "and byte is %d\n", foo)); + int thing = BitsInByte - kBits; + HCDBG(fprintf(stderr, "thing is %d\n", thing)); + _toRead = thing; + _readByte = foo; + int right = (_readByte >> _toRead); + HCDBG(fprintf(stderr, "right is %d\n", right)); + int left = result << kBits; + HCDBG(fprintf(stderr, "kbits are %d\n", kBits)); + HCDBG(fprintf(stderr, "left is %d\n", left)); + int ret = left | right; +// int ret = (result << kBits) | ((_readByte = foo) >> (_toRead = BitsInByte - kBits)); + HCDBG(fprintf(stderr, "and final is %d\n", ret)); + return ret; + } + else + { + _toRead = 0; + HCDBG(fprintf(stderr, "and this result says %d\n", result)); + return result; + } + } +} + +bool Decompressor::readNext(int k, CompressorIterator &it) +{ + if (read() != 0) + { + it.value(_path | read(k)); + return true; + } + else + { + for (int count = 1;; _readByte = getNextByte(), _toRead = BitsInByte) + { + while (_toRead-- > 0) + { + if ((_readByte & (1 << _toRead)) != 0) + { + int saved = _path; + _path = ((_path >> (k + count) << count) | read(count)) << k; + if (_path != saved) + { + it.value(_path | read(k)); + return true; + } + else + { + return false; + } + } + else + { + ++count; + } + } + } + } +} + +void Decompressor::decode(int k, IntegerArray &array) +{ + for (int path = 0;;) + { + if (read() != 0) + { + array.push_back(path | read(k)); + } + else + { + int count = countZeroes() + 1; + int saved = path; + path = ((path >> (k + count) << count) | read(count)) << k; + if (path != saved) // convention for end + array.push_back(path | read(k)); + else + break; + } + } +} + 
+void Decompressor::ascDecode(int k, IntegerArray &array) +{ + for (int path = 0, start = 0;;) + { + HCDBG(fprintf(stderr, "path is %d, start is %d\n", path, start)); + if (read() != 0) + { + int inread = read(k); + start += path | inread; + HCDBG(fprintf(stderr, "inread is %d\n", inread)); + int final = start; + HCDBG(fprintf(stderr, "1:Decompressor::ascDecode to %d\n", final)); + array.push_back(final); + } + else + { + int count = countZeroes() + 1; + HCDBG(fprintf(stderr, "count is %d\n", count)); + int saved = path; + int inread = read(count); + HCDBG(fprintf(stderr, "inread is %d, k is %d, path is %d\n", inread, + k, path)); + path = ((path >> (k + count) << count) | inread) << k; + if (path != saved) // convention for end + { + int anotherread = read(k); + HCDBG(fprintf(stderr, "newinread is %d\n", anotherread)); + start += path | anotherread; + int final = start; + HCDBG(fprintf(stderr, "2:Decompressor::ascDecode to %d\n", final)); + array.push_back(final); + } + else + { + break; + } + } + } +} + +int Decompressor::ascendingDecode(int k, int start, std::vector &array) +{ + int path = 0, index = 0; + while (true) + { + if (read() != 0) + array[index++] = (start += path | read(k)); + else + { + outerbreak hack = donothing; + for (int cnt = 0;; _readByte = getNextByte(), _toRead = BitsInByte) + { + while (_toRead-- > 0) + { + if ((_readByte & (1 << _toRead)) != 0) + { + ++cnt; + int Path = ((path >> (k + cnt) << cnt) | read(cnt)) << k; + if (Path != path) + { + array[index++] = (start += (path = Path) | read(k)); + hack = docontinue; + break; + } + else + return index; + } + else + ++cnt; + } + if (hack == docontinue) + break; + } + } + } +} + +class StreamDecompressor : public Decompressor +{ +private: + std::ifstream *_input; +public: + StreamDecompressor(std::ifstream &input) { initReading(input); } + using Decompressor::initReading; + virtual void initReading(std::ifstream &input) { _input = &input; Decompressor::initReading(); } + int getNextByte() + { + 
unsigned char ret; + *_input >> ret; + HCDBG(fprintf(stderr, "StreamDecompressor::getNextByte of %d\n", ret)); + return ret; + } +}; + +void Index::readPositions() +{ + getPositionsFile(); + //!!! temporary: better than fixed large value, worse than 'intelligent' size mgt + _positionsFile->seekg(0, std::ios::end); + _positionsCacheSize = _positionsFile->tellg(); + if (_positionsCacheSize < 0) _positionsCacheSize = 0; + _positionsFile->clear(); + _positionsFile->seekg(0, std::ios::beg); + + if (_positionsCacheSize <= _positionsCacheSize) + { + _allInCache = true; + _positions.resize(_positionsCacheSize); + _positionsFile->readsome((char*)(&_positions[0]), _positionsCacheSize); + std::cout << "POS fits in cache" << std::endl; + } +} + +void Index::readOffsetsTables(const std::string &fileName) +{ + std::ifstream in(indexFile(fileName).native_file_string().c_str(), std::ios::binary); + unsigned char k1; + in >> k1; + StreamDecompressor sddocs(in); + sddocs.decode(k1, _documents); + unsigned char k2; + in >> k2; + _microIndexOffsets.clear(); + StreamDecompressor sdoffsets(in); + sdoffsets.ascDecode(k2, _microIndexOffsets); + // decompress titles' ids table + unsigned char k3; + in >> k3; + _titles.clear(); + StreamDecompressor sdtitles(in); + sdtitles.decode(k3, _titles); +} + +void Index::readDocumentsTable(const std::string &fileName) +{ + std::ifstream in(indexFile(fileName).native_file_string().c_str(), std::ios::binary); + unsigned char k1; + in >> k1; + _concepts.clear(); + StreamDecompressor sddocs(in); + sddocs.ascDecode(k1, _concepts); + unsigned char k2; + in >> k2; + _offsets.clear(); + _offsets.push_back(0); + StreamDecompressor sdoffsets(in); + sdoffsets.ascDecode(k2, _offsets); + in.close(); +} + +class ContextTables; + +class Tables +{ +private: + std::vector _initialWordsCached; + std::vector _destsCached; + std::vector _linkTypesCached; + std::vector _seqNumbersCached; +public: + Tables(const std::vector &initialWords, + std::vector &dests, + 
std::vector &linkTypes, + std::vector &seqNumbers) + { + _initialWordsCached = initialWords; + _destsCached = dests; + _linkTypesCached = linkTypes; + _seqNumbersCached = seqNumbers; + } + void setTables(ContextTables &context); +}; // end of Tables + +class ContextTables +{ +public: + std::vector _initialWords; + std::vector _dests; + std::vector _linkTypes; + std::vector _seqNumbers; + int _nTextNodes; +private: + std::vector _cache; + // cached last position for linear search + int _initialWordsIndex; + // link names are shared between all microindexes in an index + std::vector _linkNames; + // offsets to tables' storage in file (or memory) + std::vector _offsets; + std::vector _contextData; // !!! fully cached for now + // auxillary + IntegerArray _kTable; + // _auxArray will be used as an auxillary to decode arrays + IntegerArray _auxArray; + int _lastDocNo; + + std::vector _markers; + +public: + ContextTables(const std::vector &offsets, const std::vector &contextData, + const std::vector &linkNames); + ~ContextTables(); + void setMicroindex(int docNo); + int parentContext(int context); + const std::string& linkName(int context); + int linkCode(const std::string &linkName); + std::vector getIgnoredElementsSet(const std::vector &ignoredElements); + bool notIgnored(int ctx, const std::vector &ignoredElements); + int firstParentWithCode(int pos, int linkCode); + int firstParentWithCode2(int pos, int linkCode, int parentCode); + int firstParentWithCode3(int pos, int linkCode, int ancestorCode); + int firstParentWithCode4(int pos, const std::vector &linkCodes); + int firstParentWithCode5(int pos, const std::vector &pathCodes); + int firstParentWithCode7(int pos, int linkCode, int seq); + bool isGoverning(int context) { return linkName(context) == "TITLE"; } + void resetContextSearch() { _initialWordsIndex = 0; } +private: + void appendSegment(int context, std::string &result); + int findIndexBin(int wordNumber); +public: + int wordContextLin(int wordNumber); +}; + 
+ContextTables::ContextTables(const std::vector &offsets, const std::vector &contextData, + const std::vector &linkNames) : _kTable(5), _auxArray(4096), _lastDocNo(-1) +{ + _offsets = offsets; + _contextData = contextData; + _linkNames = linkNames; + _cache.resize(_offsets.size()); +} + +ContextTables::~ContextTables() +{ + for (size_t i = 0; i < _cache.size(); ++i) + delete _cache[i]; +} + +void ContextTables::setMicroindex(int docNo) +{ + if (docNo != _lastDocNo) // check if we need to do anything + { + if (_cache[docNo]) + _cache[docNo]->setTables(*this); + else + { + int offset = _offsets[docNo]; + int k0 = _contextData[offset] & 0xFF; + HCDBG(fprintf(stderr, "_contextData len is %lu\n", _contextData.size())); + ByteArrayDecompressor compr(&_contextData, offset + 1); + _kTable.clear(); + compr.decode(k0, _kTable); + // decompress initialWords into auxiliary array + _auxArray.clear(); + compr.ascDecode(_kTable[0], _auxArray); // _initialWords + _initialWords = _auxArray; + _nTextNodes = _initialWords.size(); + // decompress destinations into auxiliary array + _auxArray.clear(); + compr.decode(_kTable[1], _auxArray); // _dests + _auxArray.push_back(-1); // sentinel, root + _dests = _auxArray; + _linkTypes.clear(); + compr.decode(_kTable[2], _linkTypes); + _seqNumbers.clear(); + compr.decode(_kTable[3], _seqNumbers); + + _cache[docNo] = new Tables(_initialWords, _dests, _linkTypes, _seqNumbers); + + /* + System.out.println("|_initialWords| = " + _nTextNodes); + System.out.println("|_dests| -1 = " + (_dests.length - 1)); + System.out.println("|_seqNumbers| = " + _seqNumbers.length); + System.out.println("|_linkTypes| = " + _linkTypes.length); + */ + } + _lastDocNo = docNo; + _markers.resize(_dests.size()); + } + _initialWordsIndex = 0; +} + +int ContextTables::parentContext(int context) +{ + return _dests[context]; +} + +const std::string& ContextTables::linkName(int context) +{ + return _linkNames[_linkTypes[context]]; +} + +int ContextTables::linkCode(const 
std::string &inlinkName) +{ + for (size_t i = 0; i < _linkNames.size(); i++) + if (inlinkName == _linkNames[i]) + return i; + return -1; // when not found +} + +std::vector ContextTables::getIgnoredElementsSet(const std::vector &ignoredElements) +{ + std::vector result; + bool noValidIgnoredElements = true; + if (!ignoredElements.empty()) + { + result.resize(_linkNames.size()); + for (size_t i = 0; i < ignoredElements.size(); i++) + { + int code = linkCode(ignoredElements[i]); + if (code > -1) + { + result[code] = true; + noValidIgnoredElements = false; + } + } + } + return noValidIgnoredElements ? std::vector() : result; +} + +bool ContextTables::notIgnored(int ctx, const std::vector &ignoredElements) +{ + do + { + if (ignoredElements[_linkTypes[ctx]]) + { + std::cout << "hit ignored" << std::endl; + return false; + } + } + while ((ctx = _dests[ctx]) > -1); // parentContext 'hand inlined' + return true; +} + +/** starting with ctx and going up the ancestry tree look for the first + context with the given linkCode */ +int ContextTables::firstParentWithCode(int pos, int inlinkCode) +{ + int ctx = _dests[wordContextLin(pos)]; // first parent of text node + int shift = _nTextNodes; + int limit = _dests.size() - 1; + while (_linkTypes[ctx - shift] != inlinkCode) + if ((ctx = _dests[ctx]) == limit) + return -1; + return ctx; +} + +/** starting with ctx and going up the ancestry tree look for the first + context with the given linkCode and given parent code */ +int ContextTables::firstParentWithCode2(int pos, int inlinkCode, int parentCode) +{ + int ctx = _dests[wordContextLin(pos)]; // first parent of text node + int shift = _nTextNodes; + int limit = _dests.size() - 1; + for (int parent = _dests[ctx]; parent < limit; parent = _dests[parent]) + if (_linkTypes[parent - shift] == parentCode && _linkTypes[ctx - shift] == inlinkCode) + return ctx; + else + ctx = parent; + return -1; +} + +/** starting with ctx and going up the ancestry tree look for the first + context with 
the given linkCode and given ancestor code */ +int ContextTables::firstParentWithCode3(int pos, int inlinkCode, int ancestorCode) +{ + int ctx = _dests[wordContextLin(pos)]; + int shift = _nTextNodes; + int limit = _dests.size() - 1; + // find first instance of linkCode + while (ctx < limit && _linkTypes[ctx - shift] != inlinkCode) + ctx = _dests[ctx]; + if (ctx < limit) // found linkCode, check ancestry + for (int ancestor = _dests[ctx]; + ancestor < limit; + ancestor = _dests[ancestor]) + if (_linkTypes[ancestor - shift] == ancestorCode) // ancestor confirmed + return ctx; // match found, return successful ctx + return -1; // match NOT found +} + +/** starting with ctx and going up the ancestry tree look for the first + context with any of the given linkCode */ +int ContextTables::firstParentWithCode4(int pos, const std::vector &linkCodes) +{ + int nCodes = linkCodes.size(); + int shift = _nTextNodes; + int limit = _dests.size() - 1; + for (int ctx = _dests[wordContextLin(pos)]; ctx < limit; ctx = _dests[ctx]) + { + int code = _linkTypes[ctx - shift]; + for (int i = 0; i < nCodes; i++) + if (code == linkCodes[i]) + return ctx; + } + return -1; +} + +/** starting with ctx and going up the ancestry tree look for the first + context with the given path */ +int ContextTables::firstParentWithCode5(int pos, const std::vector &pathCodes) +{ + int nCodes = pathCodes.size(); + int lastCode = pathCodes[nCodes - 1]; + int shift = _nTextNodes; + int limit = _dests.size() - 1; + int ctx = _dests[wordContextLin(pos)]; + for (int parent = _dests[ctx]; parent < limit; parent = _dests[parent]) + { + if (_linkTypes[ctx - shift] == lastCode) + { + // try to match the entire path + outerbreak hack = donothing; + for (int i = nCodes - 2, parent2 = parent; i >= 0; i--) + if (_linkTypes[parent2 - shift] != pathCodes[i]) // match failure + { + hack = docontinue; + break; // try to match higher + } + else if ((parent2 = _dests[parent2]) == limit) + return -1; + if (hack == docontinue) + 
continue; + return ctx; + } + else + ctx = parent; + } + return -1; +} + +/** starting with ctx and going up the ancestry tree look for the first + context with the given linkCode */ +int ContextTables::firstParentWithCode7(int pos, int inlinkCode, int seq) +{ + int ctx = _dests[wordContextLin(pos)]; // first parent of text node + int shift = _nTextNodes; + int limit = _dests.size() - 1; + while (_linkTypes[ctx - shift] != inlinkCode || _seqNumbers[ctx] != seq) + if ((ctx = _dests[ctx]) == limit) + return -1; + return ctx; +} + +void ContextTables::appendSegment(int context, std::string &result) +{ + result.append(context < _nTextNodes ? "text()" : _linkNames[_linkTypes[context - _nTextNodes]]); + result.push_back('['); + std::ostringstream tmp; + tmp << _seqNumbers[context]; + result.append(tmp.str()); + result.append("]/"); +} + +int ContextTables::findIndexBin(int wordNumber) +{ + int i = 0, j = _nTextNodes - 1; + while (i <= j) + { + int k = (i + j) >> 1; + if (_initialWords[k] < wordNumber) + i = k + 1; + else if (_initialWords[k] > wordNumber) + j = k - 1; + else + return k; + } + return i - 1; +} + +int ContextTables::wordContextLin(int wordNumber) +{ + for (int i = _initialWordsIndex; i < _nTextNodes; i++) + if (_initialWords[i] > wordNumber) // first such i + { + // - 1 if wordNumbers can be the same + _initialWordsIndex = i; // cached to speed up next search + return i - 1; + } + return _nTextNodes - 1; +} + +void Tables::setTables(ContextTables &context) +{ + context._initialWords = _initialWordsCached; + context._dests = _destsCached; + context._linkTypes = _linkTypesCached; + context._seqNumbers = _seqNumbersCached; + context._nTextNodes = context._initialWords.size(); +} + +class Compressor; + +class XmlIndex : public Index +{ +private: + VectorBtreeParameters *_edgesParams; + FullVectorBtree *_edges; + ContextTables *_contextTables; + std::fstream *_contextsFile; + IntegerArray _contextsOffsets; + std::vector _contextsData; + std::vector _linkNames; 
+protected: + virtual void writeOutOffsets(); +public: + XmlIndex(const fs::path &index, bool update) + : Index(index, update), _edgesParams(0), _edges(0), _contextTables(0), _contextsFile(0) {} + void init(); + void close(); + virtual ~XmlIndex() { delete _edgesParams; delete _edges; delete _contextTables; } + std::fstream& getContextsFile(); + using Index::compress; + virtual void compress(int docID, int titleID, + std::vector &locations, + std::vector &extents, + int k, const Compressor &contextTables); + const std::vector& getLinkNames() { return _linkNames; } +}; + +void XmlIndex::init() +{ + Index::init(); + if (_edgesParams) delete _edgesParams; + _edgesParams = new VectorBtreeParameters(*_schema, "EDGE", 9); + if (_edgesParams->readState() == false) + _edgesParams->setBlockSize(1024); + _edges = new FullVectorBtree(_edgesParams, _update); + if (!_contextsOffsets.empty()) + { + _contextsData = readByteArray("CONTEXTS"); +#if 0 + _linkNames = (String[])readObject("LINKNAMES"); +#endif + _contextTables = new ContextTables(_contextsOffsets, _contextsData, _linkNames); + } +} + +void XmlIndex::writeOutOffsets() +{ + Index::writeOutOffsets(); + if (!_contextsOffsets.empty()) + { + std::fstream &out = getOffsetsFile(); + Compressor offsets2; + char k = static_cast(offsets2.compressAscending(_contextsOffsets)); + out << k; + offsets2.write(out); + } +} + +std::fstream& XmlIndex::getContextsFile() +{ + if (!_contextsFile) + _contextsFile = getRAF("CONTEXTS", _update); + return *_contextsFile; +} + +void XmlIndex::close() +{ + if (_contextsFile) + { + _contextsFile->close(); + delete _contextsFile; + _contextsFile = 0; + } + _edges->close(); + if (_update) + _edgesParams->updateSchema(); + Index::close(); +} + +class Tokenizer +{ +private: + UnicodeString s; + BreakIterator *bi; + int32_t start; + UConverter *utf8; + std::vector utfbuffer; +public: + Tokenizer(); + ~Tokenizer(); + void setText(const xmlChar *text); + std::string nextToken(); +}; + 
+Tokenizer::Tokenizer() : start(BreakIterator::DONE), utfbuffer(64) +{ + UErrorCode status = U_ZERO_ERROR; + bi = BreakIterator::createWordInstance(Locale::getUS(), status); + utf8 = ucnv_open("utf-8", &status); +} + +Tokenizer::~Tokenizer() +{ + delete bi; + ucnv_close(utf8); +} + +void Tokenizer::setText(const xmlChar *text) +{ + UErrorCode status = U_ZERO_ERROR; + s = UnicodeString((const char*)text, -1, utf8, status); + bi->setText(s); + start = ubrk_first(bi); +} + +std::string Tokenizer::nextToken() +{ + std::string ret; + + int32_t end = ubrk_next(bi); + while (end != BreakIterator::DONE) + { + if (ubrk_getRuleStatus(bi) != UBRK_WORD_NONE) + break; + start = end; + end = ubrk_next(bi); + } + + if (end != -1 && end != start) + { + UnicodeString token(s, start, end-start); + token = token.toLower(); + size_t needed = 0; + + UErrorCode status = U_ZERO_ERROR; + while ((needed = token.extract(&utfbuffer[0], utfbuffer.size(), utf8, status)) > utfbuffer.size()) + utfbuffer.resize(utfbuffer.size() * 2); + + ret = std::string(&utfbuffer[0], needed); + start = end; + } + + return ret; +} + +typedef std::vector Vector; + +ConceptLocation::ConceptLocation(int conceptID, int begin, int end) : + _concept(conceptID), _begin(begin), _end(end) +{ +} + +#ifdef EMULATEORIGINALSORT +class ConceptLocationSorter +{ +public: + virtual bool smallerThan(const ConceptLocation &a, const ConceptLocation &b) = 0; +private: + // part of quicksearch + int partition(std::vector &array, int p, int r) + { + ConceptLocation x = array[(p + r)/2]; + int i = p - 1, j = r + 1; + while (true) + { + while (smallerThan(x, array[--j])) + ; + while (smallerThan(array[++i], x)) + ; + if (i < j) + { + ConceptLocation t = array[i]; + array[i] = array[j]; + array[j] = t; + } + else + return j; + } + } +public: + void quicksort(std::vector &array, int p, int r) + { + while (p < r) + { + int q = partition(array, p, r); + quicksort(array, p, q); + p = q + 1; + } + } +}; + +class ConceptSorter : public 
ConceptLocationSorter +{ +public: + bool smallerThan(const ConceptLocation &a, const ConceptLocation &b) + { + return a._concept < b._concept; + } +}; + +class PositionSorter : public ConceptLocationSorter +{ +public: + bool smallerThan(const ConceptLocation &a, const ConceptLocation &b) + { + return a._begin < b._begin || a._begin == b._begin && a._end < b._end; + } +}; + +#else + +class ConceptSorter +{ +public: + bool operator()(const ConceptLocation &a, const ConceptLocation &b) const + { + return a._concept < b._concept; + } +}; + +class PositionSorter +{ +public: + bool operator()(const ConceptLocation &a, const ConceptLocation &b) const + { + return a._begin < b._begin || (a._begin == b._begin && a._end < b._end); + } +}; + +#endif + +void ConceptLocation::sortByPosition(std::vector &array, int i1, int i2) +{ +#ifdef EMULATEORIGINALSORT + PositionSorter _pComp; + _pComp.quicksort(array, i1, i2 - 1); +#else + std::vector::iterator begin = array.begin(); + std::vector::iterator end = begin; + std::advance(begin, i1); + std::advance(end, i2); + std::sort(begin, end, PositionSorter()); +#endif +} + +void ConceptLocation::sortByConcept(std::vector &array, int i1, int i2) +{ +#ifdef EMULATEORIGINALSORT + ConceptSorter _cComp; + _cComp.quicksort(array, i1, i2 - 1); +#else + std::vector::iterator begin = array.begin(); + std::vector::iterator end = begin; + std::advance(begin, i1); + std::advance(end, i2); + std::sort(begin, end, ConceptSorter()); +#endif +} + +typedef std::map NodeHashtable; +typedef std::hash_map LinkHashTable; + +class IndexAdapter +{ +private: + static int StackSize; + const char* _indexText_Name; + const char* _indexElement_Name; + const char* _indexAttribute_Name; + const char* _nodeID_Name; + const char* _tokenizer_Name; + const char* _attributeName_Name; + std::vector _indexOnOffStack; + int _sp; + int _tsp; + std::vector< std::string > _attributeStack; + xmlNodePtr _currentNode; + int _attrSP; + void storeLocation(const std::string &token, 
int number); + void storeLocation(const std::string &token) { storeLocation(token, _lastWordNumber++); } + void storeEdge(int relation, int seqNumber, int destination); + + void startElement(xmlNodePtr node); + void attribute(const char *name, const char *value); + void characters(const xmlChar *str); + void endElement(xmlNodePtr node); + + void indexText(const xmlChar *str); + + Vector _textNodes; + NodeHashtable _numberedNodes; +public: + HashSet _stoplist; + LinkHashTable _linkCodes; + std::vector _linknames; + static int CurrenMaxLinkCode; + std::vector _locations; + int _availContextNumber; + IntegerArray _initialWords; + IntegerArray _links; + IntegerArray _dests; + IntegerArray _seqNumbers; + int _lastWordNumber; + int _firstWord; + bool _anyLocationsStored; + XmlIndex *_index; +private: + static int InitSize; + int _size; +public: + IndexAdapter(); + void process(xmlNodePtr node, xmlDocPtr doc); + void init(); + void finish(); + int intern(const std::string &name) { return _index->intern(name); } + int getLinkCode(const std::string &linkName); +}; + +int IndexAdapter::StackSize = 64; +int IndexAdapter::InitSize = 4096; +int IndexAdapter::CurrenMaxLinkCode = 0; + +IndexAdapter::IndexAdapter() + : _indexOnOffStack(StackSize), _attributeStack(StackSize), + _anyLocationsStored(false), _size(InitSize) +{ + _indexText_Name = "text"; + _indexElement_Name = "element"; + _indexAttribute_Name = "attribute"; + _nodeID_Name = "nodeID"; + _tokenizer_Name = "tokenizer"; + _attributeName_Name = "attributeName"; +} + +void IndexAdapter::storeLocation(const std::string &token, int number) +{ + int concept = intern(token); + HCDBG(std::cerr << "storeLocation of number " << number << "for token " + << token << " as conceptlocation " << concept << std::endl); + _locations.push_back(ConceptLocation(concept, number, number)); +} + +void IndexAdapter::storeEdge(int relation, int seqNumber, int destination) +{ + _links.push_back(relation); + _seqNumbers.push_back(seqNumber); + 
_dests.push_back(destination); + HCDBG(std::cerr << "storeEdge" << std::endl); +} + +void IndexAdapter::finish() +{ + _numberedNodes.clear(); + _dests.clear(); + _seqNumbers.clear(); + _links.clear(); + + int nTextNodes = _textNodes.size(); + _availContextNumber = nTextNodes; + // vector to hold parents of text nodes + Vector parents; + /***** + for each of the text nodes its sequence number is stored + as well as the index of its parent (in _dests) + _link is not stored as it is always "text()" + _availContextNumber only used to number parent element contexts + ******/ + for (int i = 0; i < nTextNodes; i++) + { + xmlNodePtr node = _textNodes[i]; + xmlNodePtr parent = node->parent; + // find this text node's seq number + int counter = 1; + xmlNodePtr sibling = parent->xmlChildrenNode; + while (sibling && sibling != node) + { + if (xmlNodeIsText(sibling)) + ++counter; + sibling = sibling->next; + } + _seqNumbers.push_back(counter); + // check whether parent already encountered + NodeHashtable::const_iterator number = _numberedNodes.find(parent); + if (number == _numberedNodes.end()) // not yet seen + { + int newContext = _availContextNumber++; + _numberedNodes.insert(NodeHashtable::value_type(parent, newContext)).first->second = newContext; + _dests.push_back(newContext); + // enqueue parent: its parent will need a number too + parents.push_back(parent); + // System.out.println(parent.getName().toString() + + // " -> " + newContext); + } + else + { + _dests.push_back(number->second); + } + } // end for + + _textNodes.clear(); + + // store info about element ancestry of the above text nodes + // grandparents are added to the end of the vector + int rootElementPos = 0; + for (size_t i = 0; i < parents.size(); i++) + { + xmlNodePtr node = parents[i]; + + std::string name((const char*)(node->name)); + + xmlNodePtr parent = node->parent; + + _links.push_back(getLinkCode(name)); + +// if (parent.getType() == Node.ELEMENT) // not ROOT + if (parent && parent->parent) // not 
ROOT + { + // find sequence number + xmlNodePtr sibling = parent->xmlChildrenNode; + int counter = 1; + while (sibling && sibling != node) + { + if (strcmp((const char*)sibling->name, (const char*)name.c_str()) == 0) + ++counter; + sibling = sibling->next; + } + + _seqNumbers.push_back(counter); + + // check whether parent already known + NodeHashtable::iterator number = _numberedNodes.find(parent); + if (number == _numberedNodes.end()) + { + int newContext = _availContextNumber++; + _numberedNodes.insert(NodeHashtable::value_type(parent, newContext)).first->second = newContext; + _dests.push_back(newContext); + // enqueue parent: its parent will need a number too + parents.push_back(parent); + //System.out.println(parent.getName().toString() + + // " -> " + newContext); + } + else + { + _dests.push_back(number->second); + } + } + else + { + _dests.push_back(0); // placeholder + _seqNumbers.push_back(1); + rootElementPos = i + nTextNodes; + // System.out.println("rootElementPos = " + i); + } + } // end for + + if (_dests.empty()) + _dests.push_back(0); + + // index to sentinel + _dests[rootElementPos] = _availContextNumber; +} // end public void finish + +void IndexAdapter::init() +{ + _sp = -1; + _tsp = -1; + _attrSP = -1; + _lastWordNumber = 0; + _anyLocationsStored = false; + _availContextNumber = 0; + // all the contexts' tables + _initialWords.clear(); + _locations.clear(); +} + +void IndexAdapter::attribute(const char *name, const char *value) +{ + HCDBG(std::cerr << "attribute: " << name << " = " << value << std::endl); + if (strcmp(name, _nodeID_Name) == 0) + _currentNode = (xmlNodePtr)(strtol(value, NULL, 10)); + else if (strcmp(name, _tokenizer_Name) == 0) + { + if (strcmp(value, "com.sun.xmlsearch.util.SimpleTokenizer") != 0) + std::cerr << "changing tokenizers not implemented in C++ version of HelpLinker" + << " because no other tokenizers were referenced in the helpcontent2 source" + << std::endl; + } + else if (strcmp(name, _attributeName_Name) == 0) 
+ { + //namespace prefix ? + std::string attrVal = std::string("index:") + value; + std::cout << "attrVal = " << attrVal << std::endl; + _attributeStack[_attrSP] = std::string(name) + '<' + value + '<' + attrVal; + storeLocation("+<" + _attributeStack[_attrSP]); + } +} + +void IndexAdapter::indexText(const xmlChar *text) +{ + static Tokenizer tokenizer; + tokenizer.setText(text); + _firstWord = _lastWordNumber; + _anyLocationsStored = false; + + std::string lowercaseToken = tokenizer.nextToken(); + while (!lowercaseToken.empty()) + { + HCDBG(std::cerr << "token is: " << lowercaseToken << std::endl); +#ifdef EMULATEORIGINAL + if ((lowercaseToken.size() == 1) && isdigit(lowercaseToken[0])) + { + lowercaseToken = tokenizer.nextToken(); + continue; + } +#endif + if (std::find(_stoplist.begin(), + _stoplist.end(), lowercaseToken) == _stoplist.end()) + { + storeLocation(lowercaseToken); + _anyLocationsStored = true; + } + else + _lastWordNumber++; + lowercaseToken = tokenizer.nextToken(); + } + + if (_anyLocationsStored && _firstWord > -1) + { + _initialWords.push_back(_firstWord); + HCDBG(std::cerr << "appending " << _firstWord << std::endl); + _textNodes.push_back(_currentNode); + } + // reset before next batch + _firstWord = -1; +} + +void IndexAdapter::characters(const xmlChar *str) +{ + if (!str) + std::cerr << "rats, no characters!" 
<< std::endl; + + HCDBG(std::cerr << "IndexAdapter::characters of " << str << std::endl); + HCDBG(std::cerr << _sp << " : " << _indexOnOffStack[_sp] << std::endl); + + if (_sp >= 0 && _indexOnOffStack[_sp]) + { + indexText( str ); + } +} + +void IndexAdapter::startElement(xmlNodePtr node) +{ + const char *name = (const char*)(node->name); + + HCDBG(std::cerr << "startElement is " << name << std::endl); + + if (strcmp(name, _indexElement_Name) == 0) + { + _indexOnOffStack[++_sp] = true; + // pop Tokenizer stack + // following attribute can push selected Tokenizer + if (_tsp != -1) + _tsp--; + } + else if (strcmp(name, _indexText_Name) == 0) + { + } + else if (strcmp(name, _indexAttribute_Name) == 0) + { + _attrSP++; + } +} + +void IndexAdapter::endElement(xmlNodePtr node) +{ + const char *name = (const char*)(node->name); + HCDBG(std::cerr << "endElement is " << name << std::endl); + if (strcmp(name, _indexElement_Name) == 0) + _sp--; + else if (strcmp(name, _indexText_Name) == 0) + { + // reset + } + else if (strcmp(name, _indexAttribute_Name) == 0) + storeLocation("-<" + _attributeStack[_attrSP--]); +} + +int IndexAdapter::getLinkCode(const std::string &linkName) +{ + LinkHashTable::iterator code = _linkCodes.find(linkName); + if (code != _linkCodes.end()) + return code->second; + else + { + _linknames.push_back(linkName); + int newCode = CurrenMaxLinkCode++; + _linkCodes.insert(LinkHashTable::value_type(linkName, newCode)).first->second = newCode; + return newCode; + } +} + +void IndexAdapter::process(xmlNodePtr node, xmlDocPtr doc) +{ + startElement(node); + + for (xmlAttrPtr attr = node->properties; attr; attr = attr->next) + { + xmlChar *value = xmlNodeListGetString(doc, attr->children, 0); + attribute((const char*)(attr->name), (const char*)value); + xmlFree(value); + } + + if (xmlNodeIsText(node)) + { + xmlChar *str = xmlNodeListGetString(doc, node, 1); + characters(str); + xmlFree(str); + } + + for (xmlNodePtr test = node->xmlChildrenNode; test; test = 
test->next) + process(test, doc); + + endElement(node); +} + +class XmlIndexBuilder +{ +private: + fs::path _transformLocation; + xsltStylesheetPtr _defaultTransform; + xsltStylesheetPtr _indexingTransform; + IndexAdapter _indexAdapter; + int _currentDocID; + void reset(); + xsltStylesheetPtr getTransform(const std::string &stylesheetName); +public: + XmlIndexBuilder() : _defaultTransform(0), _indexingTransform(0) {} + XmlIndexBuilder(const fs::path &dir); + ~XmlIndexBuilder(); + void clearIndex(); + void setTransformLocation(const fs::path &filelocation); + void init(const std::string &transform); + void initXmlProcessor(const std::string &transform); + void indexDocument(xmlDocPtr document, const std::string &docURL, const std::string &title); + int intern(const std::string &name); + void openDocument(const std::string &name); + void closeDocument(const std::string &name); + void close(); +}; + +void XmlIndexBuilder::close() +{ + fs::path fullname = _indexAdapter._index->indexFile("LINKNAMES"); + std::fstream _linkFile(fullname.native_file_string().c_str(), std::ios::out | std::ios::trunc | std::ios::binary); + +#ifdef EMULATEORIGINAL + static const unsigned char vectorheader[] = + { + 0xAC, 0xED, 0x00, 0x05, 0x75, 0x72, 0x00, 0x13, + 0x5B, 0x4C, 0x6A, 0x61, 0x76, 0x61, 0x2E, 0x6C, + 0x61, 0x6E, 0x67, 0x2E, 0x53, 0x74, 0x72, 0x69, + 0x6E, 0x67, 0x3B, 0xAD, 0xD2, 0x56, 0xE7, 0xE9, + 0x1D, 0x7B, 0x47, 0x02, 0x00, 0x00, 0x78, 0x70 + }; + + _linkFile.write((const char*)(&vectorheader[0]), sizeof(vectorheader)); + writeInt(_linkFile, _indexAdapter._linknames.size()); + std::vector::iterator aEnd = _indexAdapter._linknames.end(); + for (std::vector::iterator aIter = _indexAdapter._linknames.begin(); + aIter != aEnd; ++aIter) + { + HCDBG(std::cerr << "linkname is " << *aIter << std::endl); + _linkFile << 't'; + writeShort(_linkFile, aIter->size()); + _linkFile << *aIter; + } +#else + std::vector::iterator aEnd = _indexAdapter._linknames.end(); + for 
(std::vector::iterator aIter = _indexAdapter._linknames.begin(); + aIter != aEnd; ++aIter) + { + _linkFile << *aIter << '\n'; + } +#endif +#if 0 + + // output link codes + /* + Enumeration keys = _linknames.elements(); + while (keys.hasMoreElements()) + System.out.println((String)keys.nextElement()); + */ +#endif + _indexAdapter._index->close(); + std::cout << "done" << std::endl; +} + +int XmlIndexBuilder::intern(const std::string &name) +{ + return _indexAdapter.intern(name); +} + +void XmlIndexBuilder::openDocument(const std::string &name) +{ + if (_currentDocID != 0) + { + std::cerr << "document already open" << std::endl; + exit(-1); + } + _currentDocID = intern( PrefixTranslator::translatePrefix(name) ); + reset(); // reset context gathering state +} + +int BitBuffer::InitSize = 256; +int BitBuffer::NBits = 32; +int BitBuffer::BitsInByte = 8; +int BitBuffer::BytesInInt = 4; + +void Compressor::encode(const IntegerArray &pos, int k) +{ + HCDBG(std::cerr << "1:start this encode of " << k << "size of " + << pos.size() << std::endl); + unsigned int n1 = 0; + unsigned int power = 1 << k; + for (size_t i = 0; i < pos.size(); i++) + { + HCDBG(std::cerr << "1: loop " << i << std::endl); + unsigned int n2 = pos[i] >> k; + int rem = pos[i] % power; + HCDBG(std::cerr << "1: n1, n2 : " << n1 << "," << n2 << std::endl); + if (n2 != n1) + { + unsigned int min = n1; + unsigned int a = n1; + int lev = 0, power2 = 1; + if (n2 > n1) + for (size_t max = n1; max < n2; a >>= 1, power2 <<= 1, lev++) + if ((a & 1) != 0) + min -= power2; + else + max += power2; + else + for ( ; min > n2; a >>= 1, power2 <<= 1, lev++) + if ((a & 1) != 0) + min -= power2; + // lev 0s, 1, lev bits of (n2 - min) plus following value + // no 'V' symbol needed here + if (lev*2 + 1 + k <= NBits) + _buffer.append((1<> k; + int rem = pos[i] % power; + HCDBG(std::cerr << "2: n1, n2 : " << n1 << "," << n2 << std::endl); + if (n2 != n1) + { + int min = n1, a = n1; + int lev = 0, power2 = 1; + if (n2 > n1) + for 
(int max = n1; max < n2; a >>= 1, power2 <<= 1, lev++) + if ((a & 1) != 0) + min -= power2; + else + max += power2; + else + for ( ; min > n2; a >>= 1, power2 <<= 1, lev++) + if ((a & 1) != 0) + min -= power2; + // lev 0s, 1, lev bits of (n2 - min) plus following value + if (lev*2 + 1 + k <= NBits) + _buffer.append((1< 0; k--) + { + _buffer.clear(); + encode(array, k); + if (_buffer.bitCount() < min) + { + saved.setFrom(_buffer); + min = _buffer.bitCount(); + minK = k; + } + else + break; + } + } + + _buffer.setFrom(saved); + return minK; +} + +int Compressor::compressAscending(const IntegerArray &array) +{ + IntegerArray differences(array.size()); + toDifferences(array, differences); + return minimize(differences, BeginK); +} + +int Compressor::NBits = 32; +int Compressor::BeginK = 5; + +class DocumentCompressor +{ +public: + static int NConceptsInGroup; + static int BitsInLabel; + static int DefaultSize; +private: + int _nGroups; + int _nExtents; + unsigned int _freeComp; + int _kk; + Compressor *_currentCompressor; + std::vector _compressors; + Compressor _kCompr; + Compressor _lCompr; + Compressor _mCompr; + Compressor _posCompressor; + IntegerArray _kTable; // k's for the series + IntegerArray _lTable; // lengths of the C/P groups + IntegerArray _maxConcepts; // maximal concepts in CP + IntegerArray _concepts; + IntegerArray _documents; + IntegerArray _microIndexOffsets; + IntegerArray _titles; + // _contextsOffsets for use in XML indexing + IntegerArray _contextsOffsets; + IntegerArray _positions; + IntegerArray _labels; + +public: + DocumentCompressor() : _currentCompressor(0), _compressors(DefaultSize) {} + void writeOutMicroIndex(std::fstream &output, + std::vector &locations, + std::vector &extents) + { + HCDBG(std::cerr << "writeOutMicroIndex start" << std::endl); + encode(locations, NConceptsInGroup); + HCDBG(std::cerr << "writeOutMicroIndex end encode" << std::endl); + if (!extents.empty()) + encodeExtents(extents); + HCDBG(std::cerr << 
"writeOutMicroIndex finalize" << std::endl); + finalizeEncoding(); + HCDBG(std::cerr << "writeOutMicroIndex write" << std::endl); + writeOut(output); + HCDBG(std::cerr << "writeOutMicroIndex end" << std::endl); + } +private: + void encode(std::vector &locations, int nConcepts) + { + int initK = 4; + // first sort by concept only +#ifdef CMCDEBUG + for (size_t i = 0; i < locations.size(); ++i) + fprintf(stderr, "unsorted is %d\n", locations[i].getConcept()); +#endif + HCDBG(std::cerr << "start sort" << std::endl); + ConceptLocation::sortByConcept(locations, 0, locations.size()); + HCDBG(std::cerr << "end sort" << std::endl); +#ifdef CMCDEBUG + for (size_t i = 0; i < locations.size(); ++i) + fprintf(stderr, "sorted is %d\n", locations[i].getConcept()); +#endif + + // using the fact that concepts are already sorted + // count of groups of 'nConcepts' + // go for differences directly + + // clear the state + _nGroups = 0; + _nExtents = 0; + _kTable.clear(); + _lTable.clear(); + _concepts.clear(); + _maxConcepts.clear(); + _kCompr.clear(); + _lCompr.clear(); + _mCompr.clear(); + for (size_t i = 0; i < _compressors.size(); i++) + _compressors[i].clear(); + _freeComp = 0; + _currentCompressor = NULL; + // end of resetting state + + int conceptCounter = 0; + int fromIndex = 0; + int prevMax = 0; + int last = locations[0].getConcept(); // init w/ first ID + nextCompressor(); + _concepts.push_back(last); + for (size_t i = 0;;) + { + for (; i < locations.size() && locations[i].getConcept() == last; i++) + locations[i].setConcept(conceptCounter); + if (i == locations.size()) + { + if (!_concepts.empty()) + { + ++_nGroups; + _kTable.push_back(_currentCompressor->minimize(_concepts, initK)); + } + encodePositions(locations, fromIndex, i, BitsInLabel); + break; + } + else + { // new concept (group?) 
+ if (++conceptCounter == nConcepts) + { + ++_nGroups; + // we are looking at the beginning of a new group + // last is maximal for the group just finished + // it won't be stored in concepts array but maxConcepts + _concepts.pop_back(); + HCDBG(fprintf(stderr, "_maxConcepts %d %d -> %d\n", last, prevMax, last - prevMax)); + _maxConcepts.push_back(last - prevMax); + prevMax = last; + _kTable.push_back(_currentCompressor->minimize(_concepts, initK)); + +#ifdef CMCDEBUG + for(size_t p = 0; p < locations.size(); ++p) + std::cerr << "microindex2 this testing is " << locations[p].getBegin() << + locations[p].getEnd() << " : " << locations[p].getConcept() << std::endl; +#endif + + HCDBG(std::cerr << "two encodePositions " << fromIndex << " " << i << std::endl); + encodePositions(locations, fromIndex, i, BitsInLabel); + fromIndex = i; + nextCompressor(); + _concepts.clear(); + conceptCounter = 0; + } + _concepts.push_back(locations[i].getConcept() - last); + last = locations[i].getConcept(); + } + } + } + + void encodePositions(std::vector &locations, int from, int to, int cK) + { + int initK = 3; + int lastPos, k; + // sort in place by psitions only +#ifdef CMCDEBUG + for (int i = from; i < to; ++i) + fprintf(stderr, "unsorted is %d %d\n", locations[i].getBegin(), locations[i].getEnd()); +#endif + ConceptLocation::sortByPosition(locations, from, to); +#ifdef CMCDEBUG + for (int i = from; i < to; ++i) + fprintf(stderr, "sorted is %d %d\n", locations[i].getBegin(), locations[i].getEnd()); +#endif + _positions.clear(); + _labels.clear(); + _positions.push_back(lastPos = locations[from].getBegin()); + _labels.push_back(locations[from].getConcept()); // now: a label + // skip duplicates + for (int i = from, j = from + 1; j < to; j++) + { + if (locations[i].equals(locations[j]) == false) + { + i = j; + HCDBG(std::cerr << "i is " << i << "locations begin is " + << locations[i].getBegin() << "last pos is " << lastPos << std::endl); + _positions.push_back(locations[i].getBegin() 
- lastPos); + lastPos = locations[i].getBegin(); + _labels.push_back(locations[i].getConcept()); // now: a label + } + } + // first find k by minimizing just positions w/o labels + _kTable.push_back(k = _posCompressor.minimize(_positions, initK)); + _posCompressor.clear(); + HCDBG(std::cerr << "start encodePositions" << std::endl); + _posCompressor.encode(_positions, _labels, k, cK); + HCDBG(std::cerr << "end encodePositions" << std::endl); + _currentCompressor->concatenate(_posCompressor); + } + + void encodeExtents(std::vector &extents) + { + // side effects: + // 'k3' added to _kTable + // a number of compressors populated: header + lengths' lists + int initK = 4; + int c = 0; + IntegerArray concepts; //difference + IntegerArray lengths; + IntegerArray kTable; + IntegerArray lTable; + // reserve a compressor for concatenated tables + nextCompressor(); + Compressor *extentsHeader = _currentCompressor; + std::vector::const_iterator aEnd = extents.end(); + for (std::vector::const_iterator aIter = extents.begin(); + aIter != aEnd; ++aIter) + { + if (aIter->getConcept() != c) + { + if (c != 0) + { + _nExtents++; + nextCompressor(); + kTable.push_back(_currentCompressor->minimize(lengths, initK)); + lTable.push_back(_currentCompressor->byteCount()); + } + concepts.push_back(aIter->getConcept() - c); + c = aIter->getConcept(); + lengths.clear(); + lengths.push_back(aIter->getLength()); + } + else + lengths.push_back(aIter->getLength()); + } + // last table of lengths + nextCompressor(); + kTable.push_back(_currentCompressor->minimize(lengths, initK)); + lTable.push_back(_currentCompressor->byteCount()); + Compressor compressor1; + kTable.push_back(compressor1.minimize(lTable, initK)); + Compressor compressor2; + kTable.push_back(compressor2.minimize(concepts, initK)); + _kTable.push_back(extentsHeader->minimize(kTable, initK)); // k3 + extentsHeader->concatenate(compressor1); + extentsHeader->concatenate(compressor2); + } + + void finalizeEncoding() + { + if (_nGroups 
> 1) + { + // if extents follow C/P groups we need the length of the last group + int limit = _nExtents > 0 ? _freeComp : _freeComp - 1; + for (int j = 0; j < limit; j++) // length of last not saved + _lTable.push_back(_compressors[j].byteCount()); + + _kTable.push_back(_mCompr.minimize(_maxConcepts, 3)); + _kTable.push_back(_lCompr.minimize(_lTable, 3)); + _kk = _kCompr.minimize(_kTable, 3); + _kCompr.concatenate(_lCompr); + _kCompr.concatenate(_mCompr); + } + else if (_nGroups == 1 && _nExtents > 0) + { + // length of the single C/P group packed with k-s + _kTable.push_back(_compressors[0].byteCount()); + _kk = _kCompr.minimize(_kTable, 3); + } + } + + void writeOut(std::fstream &out) + { + if (_nExtents == 0) + { + if (_nGroups > 1) + { + unsigned char byte = static_cast((0x80 | _kk)); + out << byte; + HCDBG(std::cerr << "writeOut of " << int(byte) << std::endl); + _kCompr.write(out); // concatenated k,l,m + for (size_t j = 0; j < _freeComp; j++) + _compressors[j].write(out); + } + else // single group, no extents; code: 00 + { + out << (unsigned char)(_kTable[0]); // k1 + out << (unsigned char)(_kTable[1]); // k2 + _compressors[0].write(out); // C/P + } + } + else + { // extents + unsigned char byte = static_cast( + (_nGroups > 1 ? 
0xC0 : 0x40) | _kk); + out << byte; + _kCompr.write(out); + for (size_t j = 0; j < _freeComp; j++) + _compressors[j].write(out); + } + } + + Compressor* nextCompressor() + { + if (_freeComp == _compressors.size()) + _compressors.push_back(Compressor()); + return _currentCompressor = &_compressors[_freeComp++]; + } + + int byteCount() + { + if (_nGroups == 1 && _nExtents == 0) + return 2 + _compressors[0].byteCount(); + else + { + int result = 1; // initial kk + result += _kCompr.byteCount(); + for (size_t j = 0; j < _freeComp; j++) + result += _compressors[j].byteCount(); + return result; + } + } +}; + +int DocumentCompressor::NConceptsInGroup = 16; +int DocumentCompressor::BitsInLabel = 4; +int DocumentCompressor::DefaultSize = 32; + +DocumentCompressor& Index::getDocumentCompressor() +{ + if (!_documentCompressor) + _documentCompressor = new DocumentCompressor(); + return *_documentCompressor; +} + +void Index::compress(int docID, int titleID, + std::vector &locations, + std::vector &extents) +{ + std::fstream &positions = getPositionsFile(); + + positions.seekg(0, std::ios::end); + long currentEnd = positions.tellg(); + if (currentEnd < 0) currentEnd = 0; + positions.clear(); + positions.seekg(currentEnd, std::ios::beg); + + _documents.push_back(docID); + _microIndexOffsets.push_back(currentEnd); + HCDBG(std::cerr << "_microIndexOffsets pushed back " << currentEnd << std::endl); + HCDBG(std::cerr << "added title id of " << titleID << std::endl); + _titles.push_back(titleID); + + getDocumentCompressor().writeOutMicroIndex(positions, + locations, extents); +} + +void Index::writeOutOffsets() +{ + Compressor documents; + int k1 = documents.minimize(_documents, 8); + Compressor offsets; + int k2 = offsets.compressAscending(_microIndexOffsets); + Compressor titles; + int k3 = titles.minimize(_titles, 8); // 8 is the starting k + std::fstream &out = getOffsetsFile(); + out.seekp(0); // position at beginning + out.clear(); + unsigned char byte; + byte = 
static_cast(k1); + out << byte; + HCDBG(fprintf(stderr, "a: offset dump of %x\n", byte)); + documents.write(out); + byte = static_cast(k2); + out << byte; + HCDBG(fprintf(stderr, "b: offset dump of %x\n", byte)); + offsets.write(out); + byte = static_cast(k3); + out << byte; + HCDBG(fprintf(stderr, "c: offset dump of %x\n", byte)); + titles.write(out); +} + +Index::~Index() +{ + delete _schema; + delete _dictParams; + delete _dict; + delete _positionsFile; + delete _offsetsFile; + delete _documentCompressor; +} + +void XmlIndex::compress(int docID, int titleID, + std::vector &locations, + std::vector &extents, + int k, const Compressor &contextTables) +{ + HCDBG(std::cerr << "start compress" << std::endl); + HCDBG(std::cerr << "docID : " << docID << " titleID : " << titleID << + "locations size : " << locations.size() << "extents size : " << extents.size() << std::endl); + Index::compress(docID, titleID, locations, extents); + HCDBG(std::cerr << "end compress" << std::endl); + + std::fstream& contexts = getContextsFile(); + + contexts.seekp(0, std::ios::end); + long currentEnd = contexts.tellp(); + if (currentEnd < 0) currentEnd = 0; + contexts.clear(); + contexts.seekp(currentEnd); + writeByte(contexts, static_cast(k)); + contextTables.write(contexts); + _contextsOffsets.push_back(currentEnd); +} + +void XmlIndexBuilder::closeDocument(const std::string &title) +{ + if (_currentDocID == 0) + { + std::cerr << "no document open" << std::endl; + exit(-1); + } + else if (!_indexAdapter._locations.empty()) + { + IntegerArray kTable; + + Compressor compressor1; + Compressor compressor2; + Compressor compressor3; + Compressor compressor4; + + kTable.push_back(compressor1.compressAscending(_indexAdapter._initialWords)); + kTable.push_back(compressor2.minimize(_indexAdapter._dests, 2)); + kTable.push_back(compressor3.minimize(_indexAdapter._links, 2)); + kTable.push_back(compressor4.minimize(_indexAdapter._seqNumbers, 2)); + + Compressor compressor0; + int k0 = 
compressor0.minimize(kTable, 4); + + compressor0.concatenate(compressor1); + compressor0.concatenate(compressor2); + compressor0.concatenate(compressor3); + compressor0.concatenate(compressor4); + + std::vector dummy; + _indexAdapter._index->compress(_currentDocID, intern(title), + _indexAdapter._locations, dummy, k0, compressor0); + } + else + { + // System.out.println("no indexable content"); + } + _indexAdapter._locations.clear(); + _currentDocID = 0; // state: nothing open +} + +void XmlIndexBuilder::indexDocument(xmlDocPtr doc, const std::string &docURL, const std::string &title) +{ + HCDBG(std::cerr << "Indexing " << docURL << std::endl); + + xmlNodePtr root = xmlDocGetRootElement(doc); + + openDocument(docURL); + +// xmlDocDump(stdout, doc); + xmlDocPtr res = xsltApplyStylesheet(_indexingTransform, doc, NULL); + + _indexAdapter.init(); + + // start = System.currentTimeMillis(); + root = xmlDocGetRootElement(res); + if (root) + { +// xmlDocDump(stdout, res); + for (xmlNodePtr test = root; test; test = test->next) + _indexAdapter.process(test, res); + } + xmlFreeDoc(res); + + // System.out.println((System.currentTimeMillis()-start)+" transform"); + // start = System.currentTimeMillis(); + _indexAdapter.finish(); + // System.out.println((System.currentTimeMillis()-start)+" finish"); + // start = System.currentTimeMillis(); + closeDocument(title); + // System.out.println((System.currentTimeMillis()-start)+" close"); +} + +XmlIndexBuilder::~XmlIndexBuilder() +{ + if (_defaultTransform) xsltFreeStylesheet(_defaultTransform); + if (_indexingTransform) xsltFreeStylesheet(_indexingTransform); + delete _indexAdapter._index; +} + +void XmlIndexBuilder::setTransformLocation(const fs::path &filelocation) +{ + _transformLocation = filelocation; +} + +xsltStylesheetPtr XmlIndexBuilder::getTransform(const std::string &stylesheetName) +{ + fs::path stylesheet = _transformLocation / (stylesheetName + ".xsl"); + return xsltParseStylesheetFile((const xmlChar 
*)stylesheet.native_file_string().c_str()); +} + +void XmlIndexBuilder::initXmlProcessor(const std::string &transform) +{ + _defaultTransform = getTransform("default"); + _indexingTransform = getTransform(transform); +} + +void XmlIndexBuilder::init(const std::string &transform) +{ + _indexAdapter._index->init(); +#ifdef EMULATEORIGINAL + //some kind of bug in the original AFAICS + _indexAdapter._stoplist.push_back("andnull"); +#endif + reset(); + + // initialize vector and hashtable + const std::vector &linkNames = _indexAdapter._index->getLinkNames(); + std::vector::const_iterator aEnd = linkNames.end(); + for (std::vector::const_iterator aIter = linkNames.begin(); + aIter != aEnd; ++aIter) + { + _indexAdapter.getLinkCode(*aIter); + } + + initXmlProcessor(transform); +} + +void XmlIndexBuilder::reset() +{ + _indexAdapter._availContextNumber = 0; + _indexAdapter._lastWordNumber = 0; + _indexAdapter._locations.clear(); + _indexAdapter._anyLocationsStored = false; + // all the contexts' tables + _indexAdapter._initialWords.clear(); + _indexAdapter._dests.clear(); + _indexAdapter._links.clear(); + _indexAdapter._seqNumbers.clear(); +} + +XmlIndexBuilder::XmlIndexBuilder(const fs::path &indexDir) + : _defaultTransform(0), _indexingTransform(0), + _currentDocID(0) +{ + HCDBG(std::cerr << "indexDir is " << indexDir.native_directory_string() << std::endl); + _indexAdapter._index = new XmlIndex(indexDir, true); +} + +void XmlIndexBuilder::clearIndex() +{ + _indexAdapter._index->clear(); +} + +class HelpLinker +{ +public: + static void main(std::vector &args); +private: + HelpLinker() : init(true), xmlIndexBuilder(NULL) {} + ~HelpLinker() { delete xmlIndexBuilder; } + JarOutputStream jarOutputStream; + static int locCount, totCount; + static Stringtable additionalFiles; + static HashSet helpFiles; + static fs::path sourceRoot; + static fs::path embeddStylesheet; + static fs::path indexStylesheet; + static fs::path outputFile; + static std::string module; + static 
std::string lang; + static std::string hid; + fs::path indexDirName; + Stringtable hidlistTranslation; + fs::path indexDirParentName; + bool init; + XmlIndexBuilder* xmlIndexBuilder; + void initXMLIndexBuilder(); + void createFileFromBytes(const std::string &fileName, + const std::string &defaultXSL); + void closeXMLIndexBuilder() + { + xmlIndexBuilder->close(); + } + void link(); + void addBookmark( DB* dbBase, std::string thishid, + const std::string& fileB, const std::string& anchorB, + const std::string& jarfileB, const std::string& titleB ); +#if 0 + /** + * @param outputFile + * @param module + * @param lang + * @param hid + * @param helpFiles + * @param additionalFiles + */ + + private HelpURLStreamHandlerFactory urlHandler = null; +#endif +}; + +namespace URLEncoder +{ + static std::string encode(const std::string &rIn) + { + const char *good = "!$&'()*+,-.:=@_"; + static const char hex[17] = "0123456789ABCDEF"; + + std::string result; + for (size_t i=0; i < rIn.length(); ++i) + { + unsigned char c = rIn[i]; + if (isalnum (c) || strchr (good, c)) + result += c; + else { + result += '%'; + result += hex[c >> 4]; + result += hex[c & 0xf]; + } + } + return result; + } +} + +JarOutputStream::JarOutputStream() +{ + perlline << "use Archive::Zip qw(:ERROR_CODES); "; + perlline << "my $zip = Archive::Zip->new(); "; +} + +std::string replaceAll(std::string result, + const std::string &search, const std::string &replace) +{ + std::string::size_type pos = 0; + while(1) + { + pos = result.find(search, pos); + if (pos == std::string::npos) break; + result.replace(pos, search.size(), replace); + pos += replace.size(); + } + return result; +} + +void JarOutputStream::addFile(const std::string &fileName, const std::string &name) +{ + perlline << "$zip->addFile(\"" << replaceAll(fileName, "\\", "\\\\") << "\", \"" << name << "\"); "; +} + +void JarOutputStream::addTree(const std::string &tree, const std::string &name) +{ + perlline << "$zip->addTree(\"" << replaceAll(tree, 
"\\", "\\\\") << "\", \"" << name << "\"); "; +} + +void JarOutputStream::dontCompress(const std::string &key) +{ + perlline << "my $member = $zip->memberNamed(\"" << key << "\"); "; + perlline << "if ($member) { $member->desiredCompressionMethod( COMPRESSION_STORED ); } "; +} + +/* Writes the accumulated Archive::Zip script to a file named like the archive plus ".perl", runs it through perl and removes the script file again. */ +void JarOutputStream::commit() +{ + perlline << "$zip->writeToFileNamed(\"" << replaceAll(getname().native_file_string(), "\\", "\\\\") << "\"); "; + + fs::path tmp = getname(); + tmp.append(".perl"); + std::string perlfile = tmp.native_file_string(); + std::ofstream fos(perlfile.c_str()); + fos << perlline.str(); + fos.close(); + + /* getenv() returns NULL for an unset variable and std::string must not be constructed from NULL, so inspect the raw pointers first; an unset or empty PERL falls back to plain "perl". */ + const char* perlEnv = getenv( "PERL" ); + std::string myperl( (perlEnv && *perlEnv) ? perlEnv : "perl" ); + std::string commandline; + const char* shellEnv = getenv( "USE_SHELL" ); + const std::string is4nt( shellEnv ? shellEnv : "" ); + if( !is4nt.empty() && is4nt == "4nt" ) + { + /* when USE_SHELL is "4nt" every backslash in both paths is doubled before the command line is built */ + std::string myperl2 = replaceAll( myperl , "\\" , "\\\\" ); + myperl = myperl2 ; + myperl2 = replaceAll( perlfile , "\\" , "\\\\" ); + perlfile = myperl2 ; + } + commandline = myperl + " " + perlfile; + + HCDBG(std::cerr << "command line 3 is" << commandline << std::endl); + system(commandline.c_str()); + + fs::remove(tmp); +} + +void HelpLinker::addBookmark( DB* dbBase, std::string thishid, + const std::string& fileB, const std::string& anchorB, + const std::string& jarfileB, const std::string& titleB) +{ + HCDBG(std::cerr << "HelpLinker::addBookmark " << thishid << " " << + fileB << " " << anchorB << " " << jarfileB << " " << titleB << std::endl); + + std::string temp = thishid; + std::transform (temp.begin(), temp.end(), temp.begin(), toupper); + std::replace(temp.begin(), temp.end(), ':', '_'); + const std::string& translatedHid = hidlistTranslation[temp]; + if (!translatedHid.empty()) + thishid = translatedHid; + + thishid = URLEncoder::encode(thishid); + + DBT key; + memset(&key, 0, sizeof(key)); + key.data = const_cast(thishid.c_str()); + key.size = thishid.length(); + + int fileLen = fileB.length(); + if (!anchorB.empty()) + fileLen += (1 + 
anchorB.length()); + int dataLen = 1 + fileLen + 1 + jarfileB.length() + 1 + titleB.length(); + + std::vector dataB(dataLen); + size_t i = 0; + dataB[i++] = static_cast(fileLen); + for (size_t j = 0; j < fileB.length(); ++j) + dataB[i++] = fileB[j]; + if (!anchorB.empty()) + { + dataB[i++] = '#'; + for (size_t j = 0; j < anchorB.length(); ++j) + dataB[i++] = anchorB[j]; + } + dataB[i++] = static_cast(jarfileB.length()); + for (size_t j = 0; j < jarfileB.length(); ++j) + dataB[i++] = jarfileB[j]; + + dataB[i++] = static_cast(titleB.length()); + for (size_t j = 0; j < titleB.length(); ++j) + dataB[i++] = titleB[j]; + + DBT data; + memset(&data, 0, sizeof(data)); + data.data = &dataB[0]; + data.size = dataB.size(); + + dbBase->put(dbBase, NULL, &key, &data, 0); +} + +void HelpLinker::createFileFromBytes(const std::string &fileName, + const std::string &defaultXSL) +{ + std::ofstream fos((indexDirParentName / fileName).native_file_string().c_str()); + fos << defaultXSL; +} + +void HelpLinker::initXMLIndexBuilder() +{ + std::string mod = module; + std::transform (mod.begin(), mod.end(), mod.begin(), tolower); + indexDirName = indexDirParentName / (mod + ".idx"); + fs::create_directory(indexDirName); + + if (xmlIndexBuilder) delete xmlIndexBuilder; + xmlIndexBuilder = new XmlIndexBuilder(indexDirName); + + std::string defaultXSL = + "\n" + "\n" + "\t\n" + ""; + createFileFromBytes("default.xsl", defaultXSL); + xmlIndexBuilder->clearIndex(); // Build index from scratch + xmlIndexBuilder->setTransformLocation(indexDirParentName); +} + +namespace +{ + fs::path gettmppath() + { + fs::path ret; + osl::File::createTempFile(0, 0, &ret.data); + fs::remove(ret); + return ret; + } +} + +extern "C" void function_orig_pointer(xmlXPathParserContextPtr ctxt, int nargs) +{ + if (nargs > 1) + { + std::cerr << "function_orig_pointer, too many args" << std::endl; + exit(-1); + } + + xmlNodePtr cur = NULL; + if (nargs == 0) + cur = ctxt->context->node; + else if (nargs == 1) + { + 
xmlXPathObjectPtr obj = valuePop(ctxt); + xmlNodeSetPtr nodelist = obj->nodesetval; + + if ((nodelist == NULL) || (nodelist->nodeNr <= 0)) + { + std::cerr << "function_orig_pointer, bad nodeset" << std::endl; + exit(-1); + } + + cur = nodelist->nodeTab[0]; + for (int i = 1; i < nodelist->nodeNr; ++i) + { + int ret = xmlXPathCmpNodes(cur, nodelist->nodeTab[i]); + if (ret == -1) + cur = nodelist->nodeTab[i]; + } + + xmlXPathFreeObject(obj); + } + + if (cur == NULL) + { + std::cerr << "function_orig_pointer, bad node" << std::endl; + exit(-1); + } + + /* the node address is handed back as a decimal string; "%ld" of a 64-bit value can need 20 characters plus the terminating NUL, so a 20 byte buffer is one short - 32 leaves headroom */ + static xmlChar str[32]; + sprintf((char *)str, "%ld", (sal_uIntPtr)(cur)); + valuePush(ctxt, xmlXPathNewString(str)); +} + +extern "C" void* cmc_module_init(xsltTransformContextPtr ctxt, const xmlChar* uri) +{ + if (xsltRegisterExtFunction(ctxt, (const xmlChar*)"orig-pointer", uri, function_orig_pointer)) + { + std::cerr << "failure to register function_orig_pointer" << std::endl; + exit(-1); + } + return NULL; +} + +extern "C" void cmc_module_term(xsltTransformContextPtr, const xmlChar*, void*) +{ +} + +/** +* +*/ +void HelpLinker::link() +{ + indexDirParentName = gettmppath(); + fs::create_directory(indexDirParentName); + +#ifdef CMC_DEBUG + std::cerr << "will not delete tmpdir of " << indexDirParentName.native_file_string().c_str() << std::endl; +#endif + + std::string mod = module; + std::transform (mod.begin(), mod.end(), mod.begin(), tolower); + + // Determine the outputstream + fs::path outputTmpFile(outputFile); + outputTmpFile.append(".tmp"); + jarOutputStream.setname(outputTmpFile); + + // do the work here + // continue with introduction of the overall process thing into the + // here all hzip files will be worked on + std::string appl = mod; + if (appl[0] == 's') + appl = appl.substr(1); + + fs::path helpTextFileName(indexDirParentName / (mod + ".ht")); + DB* helpText; + db_create(&helpText,0,0); + helpText->open(helpText, NULL, helpTextFileName.native_file_string().c_str(), NULL, DB_BTREE, + DB_CREATE | 
DB_TRUNCATE, 0644); + + fs::path dbBaseFileName(indexDirParentName / (mod + ".db")); + DB* dbBase; + db_create(&dbBase,0,0); + dbBase->open(dbBase, NULL, dbBaseFileName.native_file_string().c_str(), NULL, DB_BTREE, + DB_CREATE | DB_TRUNCATE, 0644); + + fs::path keyWordFileName(indexDirParentName / (mod + ".key")); + DB* keyWord; + db_create(&keyWord,0,0); + keyWord->open(keyWord, NULL, keyWordFileName.native_file_string().c_str(), NULL, DB_BTREE, + DB_CREATE | DB_TRUNCATE, 0644); + + HelpKeyword helpKeyword; + + std::ifstream fileReader(hid.c_str()); + while (fileReader) + { + std::string key; + fileReader >> key; + std::transform (key.begin(), key.end(), key.begin(), toupper); + std::replace(key.begin(), key.end(), ':', '_'); + std::string data; + fileReader >> data; + if (!key.empty() && !data.empty()) + hidlistTranslation[key] = data; + } + + // lastly, initialize the indexBuilder + if (!helpFiles.empty()) + initXMLIndexBuilder(); + + std::cout << "Making " << outputFile.native_file_string() << + " from " << helpFiles.size() << " input files" << std::endl; + + // here we start our loop over the hzip files. 
+ HashSet::iterator end = helpFiles.end(); + for (HashSet::iterator iter = helpFiles.begin(); iter != end; ++iter) + { + std::cout << "."; + std::cout.flush(); + // process one file + // streamTable contains the streams in the hzip file + StreamTable streamTable; + const std::string &xhpFileName = *iter; + + if (xhpFileName.rfind(".xhp") != xhpFileName.length()-4) + { + // only work on .xhp - files + std::cerr << + "ERROR: input list entry '" + << xhpFileName + << "' has the wrong extension (only files with extension .xhp " + << "are accepted)"; + continue; + } + + fs::path langsourceRoot(sourceRoot); + langsourceRoot.append('/' + lang + '/'); + fs::path xhpFile(xhpFileName, fs::native); + HelpCompiler hc( + streamTable, + xhpFile, + langsourceRoot, + embeddStylesheet, + module, + lang); + + HCDBG(std::cerr << "before compile of " << xhpFileName << std::endl); + bool success = hc.compile(); + HCDBG(std::cerr << "after compile of " << xhpFileName << std::endl); + if (!success) + { + std::cerr << + "\nERROR: compiling help particle '" + << xhpFileName + << "' for language '" + << lang + << "' failed!"; + exit(1); + } + + const std::string documentBaseId = streamTable.document_id; + std::string documentPath = streamTable.document_path; + if (documentPath.find("/") == 0) + documentPath = documentPath.substr(1); + + std::string documentJarfile = streamTable.document_module + ".jar"; + + std::string documentTitle = streamTable.document_title; + if (documentTitle.empty()) + documentTitle = ""; + +#if 0 + std::cout << "for " << xhpFileName << " documentBaseId is " << documentBaseId << "\n"; + std::cout << "for " << xhpFileName << " documentPath is " << documentPath << "\n"; + std::cout << "for " << xhpFileName << " documentJarfile is " << documentJarfile << "\n"; + std::cout << "for " << xhpFileName << " documentPath is " << documentTitle << "\n"; +#endif + + const std::string& fileB = documentPath; + const std::string& jarfileB = documentJarfile; + std::string& titleB = 
documentTitle; + + // add once this as its own id. + addBookmark(dbBase, documentPath, fileB, std::string(), jarfileB, titleB); + + if (init) + { + std::ifstream indexXSLFile(indexStylesheet.native_file_string().c_str()); + std::ostringstream baos; + baos << indexXSLFile.rdbuf(); + std::string xsl = baos.str(); + + //I see that we later generate a map of generateids to nodes which we will use + //to link the results of generate-id in the transformed document back to the nodes + //in the original document, so let's cut out the middle-men and make an extension + //which does exactly what we want, and give us a pointer to the original node + xsl.replace(xsl.find("init("index"); + init = false; + } + + // first the database *.db + // ByteArrayInputStream bais = null; + // ObjectInputStream ois = null; + + const HashSet *hidlist = streamTable.appl_hidlist; + if (!hidlist) + hidlist = streamTable.default_hidlist; + if (hidlist && !hidlist->empty()) + { + // now iterate over all elements of the hidlist + HashSet::const_iterator aEnd = hidlist->end(); + for (HashSet::const_iterator hidListIter = hidlist->begin(); + hidListIter != aEnd; ++hidListIter) + { + std::string thishid = *hidListIter; + + std::string anchorB; + size_t index = thishid.rfind('#'); + if (index != std::string::npos) + { + anchorB = thishid.substr(1 + index); + thishid = thishid.substr(0, index); + } + addBookmark(dbBase, thishid, fileB, anchorB, jarfileB, titleB); + } + } + + // now the keywords + const Hashtable *anchorToLL = streamTable.appl_keywords; + if (!anchorToLL) + anchorToLL = streamTable.default_keywords; + if (anchorToLL && !anchorToLL->empty()) + { + std::string fakedHid = URLEncoder::encode(documentPath); + Hashtable::const_iterator aEnd = anchorToLL->end(); + for (Hashtable::const_iterator enumer = anchorToLL->begin(); + enumer != aEnd; ++enumer) + { + const std::string &anchor = enumer->first; + addBookmark(dbBase, documentPath, fileB, + anchor, jarfileB, titleB); + std::string totalId = 
fakedHid + "#" + anchor; + // std::cerr << hzipFileName << std::endl; + const LinkedList& ll = enumer->second; + LinkedList::const_iterator aOtherEnd = ll.end(); + for (LinkedList::const_iterator llIter = ll.begin(); + llIter != aOtherEnd; ++llIter) + { + helpKeyword.insert(*llIter, totalId); + } + } + + } + + // and last the helptexts + const Stringtable *helpTextHash = streamTable.appl_helptexts; + if (!helpTextHash) + helpTextHash = streamTable.default_helptexts; + if (helpTextHash && !helpTextHash->empty()) + { + Stringtable::const_iterator aEnd = helpTextHash->end(); + for (Stringtable::const_iterator helpTextIter = helpTextHash->begin(); + helpTextIter != aEnd; ++helpTextIter) + { + std::string helpTextId = helpTextIter->first; + const std::string& helpTextText = helpTextIter->second; + + std::string temp = helpTextId; + std::transform (temp.begin(), temp.end(), temp.begin(), toupper); + std::replace(temp.begin(), temp.end(), ':', '_'); + + const std::string& tHid = hidlistTranslation[temp]; + if (!tHid.empty()) + helpTextId = tHid; + helpTextId = URLEncoder::encode(helpTextId); + + DBT keyDbt; + memset(&keyDbt, 0, sizeof(keyDbt)); + keyDbt.data = const_cast(helpTextId.c_str()); + keyDbt.size = helpTextId.length(); + + DBT textDbt; + memset(&textDbt, 0, sizeof(textDbt)); + textDbt.data = const_cast(helpTextText.c_str()); + textDbt.size = helpTextText.length(); + helpText->put(helpText, NULL, &keyDbt, &textDbt, 0); + } + } + + // now the indexing + xmlDocPtr document = streamTable.appl_doc; + if (!document) + document = streamTable.default_doc; + if (document) + { + std::string temp = module; + std::transform (temp.begin(), temp.end(), temp.begin(), tolower); + xmlIndexBuilder->indexDocument(document, + std::string("vnd.sun.star.help://") + + temp + + "/" + + URLEncoder::encode(documentPath), + ""); + } + } // while loop over hzip files ending + + std::cout << std::endl; + + helpText->close(helpText, 0); + dbBase->close(dbBase, 0); + helpKeyword.dump(keyWord); 
+ keyWord->close(keyWord, 0); + + if (!helpFiles.empty()) + { + closeXMLIndexBuilder(); + HCDBG(std::cerr << "dir is " << indexDirName.native_directory_string() << std::endl); + jarOutputStream.addTree(indexDirName.native_file_string(), mod + ".idx"); + } + + jarOutputStream.addFile(helpTextFileName.native_file_string(), mod + ".ht"); + jarOutputStream.addFile(dbBaseFileName.native_file_string(), mod + ".db"); + jarOutputStream.addFile(keyWordFileName.native_file_string(), mod + ".key"); + + ///////////////////////////////////////////////////////////////////////// + // last, all files which should be copied into the jarFile + ///////////////////////////////////////////////////////////////////////// + + Stringtable::iterator aEnd = additionalFiles.end(); + for (Stringtable::iterator enumer = additionalFiles.begin(); enumer != aEnd; + ++enumer) + { + const std::string &additionalFileKey = enumer->first; + const std::string &additionalFileName = enumer->second; + jarOutputStream.addFile(additionalFileName, additionalFileKey); + } + + jarOutputStream.dontCompress(mod + ".jar"); + jarOutputStream.commit(); + + HCDBG(std::cerr << "like to rename " << outputTmpFile.native_file_string() << " as " << + outputFile.native_file_string() << std::endl); + fs::rename(outputTmpFile, outputFile); + if (!fs::exists(outputFile)) + { + std::cerr << "can't rename file '" << outputTmpFile.native_file_string() << "'" << std::endl; + exit(1); + } + + ///////////////////////////////////////////////////////////////////////// + /// remove temprary directory for index creation + ///////////////////////////////////////////////////////////////////////// +#ifndef CMC_DEBUG + fs::remove_all( indexDirParentName ); +#endif +} + + +int HelpLinker::locCount; +int HelpLinker::totCount; +Stringtable HelpLinker::additionalFiles; +HashSet HelpLinker::helpFiles; +fs::path HelpLinker::sourceRoot; +fs::path HelpLinker::embeddStylesheet, HelpLinker::indexStylesheet; +fs::path HelpLinker::outputFile; 
+std::string HelpLinker::module; +std::string HelpLinker::lang; +std::string HelpLinker::hid; + +void HelpLinker::main(std::vector &args) +{ + if (args.size() > 0 && args[0][0] == '@') + { + std::vector stringList; + std::string strBuf; + std::ifstream fileReader(args[0].substr(1).c_str()); + + while (fileReader) + { + std::string token; + fileReader >> token; + if (!token.empty()) + stringList.push_back(token); + } + + args = stringList; + } + + size_t i = 0; + + while (i < args.size()) + { + if (args[i].compare("-src") == 0) + { + ++i; + if (i >= args.size()) + { + std::cerr << "sourceroot missing" << std::endl; + exit(1); + } + + sourceRoot = fs::path(args[i], fs::native); + } + else if (args[i].compare("-sty") == 0) + { + ++i; + if (i >= args.size()) + { + std::cerr << "embeddingStylesheet missing" << std::endl; + exit(1); + } + + embeddStylesheet = fs::path(args[i], fs::native); + } + else if (args[i].compare("-idx") == 0) + { + ++i; + if (i >= args.size()) + { + std::cerr << "indexstylesheet missing" << std::endl; + exit(1); + } + + indexStylesheet = fs::path(args[i], fs::native); + } + else if (args[i].compare("-o") == 0) + { + ++i; + if (i >= args.size()) + { + std::cerr << "outputfilename missing" << std::endl; + exit(1); + } + + outputFile = fs::path(args[i], fs::native); + } + else if (args[i].compare("-mod") == 0) + { + ++i; + if (i >= args.size()) + { + std::cerr << "module name missing" << std::endl; + exit(1); + } + + module = args[i]; + } + else if (args[i].compare("-lang") == 0) + { + ++i; + if (i >= args.size()) + { + std::cerr << "language name missing" << std::endl; + exit(1); + } + + lang = args[i]; + } + else if (args[i].compare("-hid") == 0) + { + ++i; + if (i >= args.size()) + { + std::cerr << "hid list missing" << std::endl; + exit(1); + } + + hid = args[i]; + } + else if (args[i].compare("-add") == 0) + { + std::string addFile, addFileUnderPath; + ++i; + if (i >= args.size()) + { + std::cerr << "pathname missing" << std::endl; + exit(1); + 
} + + addFileUnderPath = args[i]; + ++i; + if (i >= args.size()) + { + std::cerr << "pathname missing" << std::endl; + exit(1); + } + addFile = args[i]; + if (!addFileUnderPath.empty() && !addFile.empty()) + additionalFiles[addFileUnderPath] = addFile; + } + else + helpFiles.push_back(args[i]); + ++i; + } + + if (indexStylesheet.empty()) + { + std::cerr << "no index file given" << std::endl; + exit(1); + } + if (embeddStylesheet.empty()) + { + std::cerr << "no embedding resolving file given" << std::endl; + exit(1); + } + if (sourceRoot.empty()) + { + std::cerr << "no sourceroot given" << std::endl; + exit(1); + } + if (outputFile.empty()) + { + std::cerr << "no output file given" << std::endl; + exit(1); + } + if (module.empty()) + { + std::cerr << "module missing" << std::endl; + exit(1); + } + if (lang.empty()) + { + std::cerr << "language missing" << std::endl; + exit(1); + } + if (hid.empty()) + { + std::cerr << "hid list missing" << std::endl; + exit(1); + } + + HelpLinker().link(); +} + +int main(int argc, char**argv) +{ + sal_uInt32 starttime = osl_getGlobalTimer(); + std::vector args; + for (int i = 1; i < argc; ++i) + args.push_back(std::string(argv[i])); + HelpLinker::main(args); + sal_uInt32 endtime = osl_getGlobalTimer(); + std::cout << "time taken was " << (endtime-starttime)/1000.0 << " seconds" << std::endl; + return 0; +} + +// vnd.sun.star.help://swriter/52821?Language=en-US&System=UNIX +/* vi:set tabstop=4 shiftwidth=4 expandtab: */ Index: util/xmlhelp/source/com/sun/star/help/makefile.mk diff -u util/xmlhelp/source/com/sun/star/help/makefile.mk:1.25 util/xmlhelp/source/com/sun/star/help/makefile.mk:1.25.12.3 --- util/xmlhelp/source/com/sun/star/help/makefile.mk:1.25 Mon Jan 15 04:38:09 2007 +++ util/xmlhelp/source/com/sun/star/help/makefile.mk Sat Jun 9 06:14:05 2007 @@ -35,103 +35,41 @@ PRJ = ..$/..$/..$/..$/.. 
PRJNAME = xmlhelp -PACKAGE = com$/sun$/star$/help -TARGET = com_sun_star_help +TARGET = HelpLinker +TARGETTYPE=CUI +LIBTARGET=no # --- Settings ----------------------------------------------------- .INCLUDE : settings.mk -JARFILES = xt-xmlsearch.jar unoil.jar ridl.jar jurt.jar jut.jar xmlsearch.jar -EXTRAJARFILES = - -.IF "$(SYSTEM_XT)" == "YES" -XCLASSPATH!:=$(XCLASSPATH)$(PATH_SEPERATOR)$(XT_JAR) +.IF "$(SYSTEM_LIBXSLT)" == "YES" +CFLAGS+= $(LIBXSLT_CFLAGS) .ELSE -JARFILES += xt.jar +LIBXSLTINCDIR=external$/libxslt +CFLAGS+= -I$(SOLARINCDIR)$/$(LIBXSLTINCDIR) .ENDIF -.IF "$(SYSTEM_XML_APIS)" == "YES" -.IF "$(XCLASSPATH)" != "" -XCLASSPATH!:=$(XCLASSPATH)$(PATH_SEPERATOR)$(XML_APIS_JAR) -.ELSE -XCLASSPATH!:=$(XML_APIS_JAR) -.ENDIF -.ELSE -JARFILES += xml-apis.jar +.IF "$(SYSTEM_DB)" == "YES" +CFLAGS+=-DSYSTEM_DB -I$(DB_INCLUDES) .ENDIF -.IF "$(SYSTEM_XERCES)" == "YES" -.IF "$(XCLASSPATH)" != "" -XCLASSPATH!:=$(XCLASSPATH)$(PATH_SEPERATOR)$(XERCES_JAR) -.ELSE -XCLASSPATH!:=$(XERCES_JAR) -.ENDIF -.ELSE -JARFILES += xercesImpl.jar -.ENDIF +OBJFILES=\ + $(OBJ)$/HelpLinker.obj \ + $(OBJ)$/HelpCompiler.obj -.IF "$(SYSTEM_DB)" == "YES" -.IF "$(XCLASSPATH)" != "" -XCLASSPATH!:=$(XCLASSPATH)$(PATH_SEPERATOR)$(DB_JAR) -.ELSE -XCLASSPATH!:=$(DB_JAR) -.ENDIF -.ELSE -JARFILES += db.jar -.ENDIF +EXCEPTIONSFILES=\ + $(OBJ)$/HelpLinker.obj \ + $(OBJ)$/HelpCompiler.obj -CLASSGENDIR = $(OUT)$/classgen -RDB = $(SOLARBINDIR)$/types.rdb -JAVAFILES = $(subst,$(CLASSDIR)$/$(PACKAGE)$/, $(subst,.class,.java $(JAVACLASSFILES))) - -# --- Files -------------------------------------------------------- - -JAVACLASSFILES = \ - $(CLASSDIR)$/$(PACKAGE)$/HelpCompiler.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpLinker.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpContentIdentifier.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpProvider.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpContent.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpOutputStream.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpDatabases.class \ - 
$(CLASSDIR)$/$(PACKAGE)$/HelpURLStreamHandlerFactory.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpURLStreamHandler.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpURLStreamHandlerWithJars.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpURLConnection.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpURLConnectionWithJars.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpURLParameter.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpResultSetFactory.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpDynamicResultSet.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpResultSetBase.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpResultSet.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpResultSetForRoot.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpIndexer.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpKeyword.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpPackager.class \ - $(CLASSDIR)$/$(PACKAGE)$/XSLData.class \ - $(CLASSDIR)$/$(PACKAGE)$/MemoryURLConnection.class \ - $(CLASSDIR)$/$(PACKAGE)$/StringDbt.class - -.IF "$(JDK)"=="gcj" -JAVACLASSFILES += \ - $(CLASSDIR)$/$(PACKAGE)$/GCJFileURLStreamHandler.class -.ELSE -JAVACLASSFILES += \ - $(CLASSDIR)$/$(PACKAGE)$/FileURLStreamHandler.class -.ENDIF -JARCLASSDIRS = com -JARTARGET = $(PRJNAME).jar -JARCOMPRESS = TRUE -CUSTOMMANIFESTFILE = manifest +APP1TARGET= $(TARGET) +APP1OBJS=\ + $(OBJ)$/HelpLinker.obj \ + $(OBJ)$/HelpCompiler.obj + +APP1STDLIBS+=$(SALLIB) $(BERKELEYLIB) $(ICUUCLIB) $(XSLTLIB) # --- Targets ------------------------------------------------------ .INCLUDE : target.mk - -.IF "$(JAVAAOTCOMPILER)" != "" -AOTTARGET = com.sun.star.help.HelpLinker -.INCLUDE : aottarget.mk -ALLTAR : $(AOTTARGETN) -.ENDIF Index: external/icu/icu-3.6.patch diff -u external/icu/icu-3.6.patch:1.5 external/icu/icu-3.6.patch:1.5.10.1 --- external/icu/icu-3.6.patch:1.5 Tue Jan 30 00:17:59 2007 +++ external/icu/icu-3.6.patch Sun May 20 06:09:32 2007 @@ -582,3 +582,107 @@ #include "ustrtest.h" #include "unicode/unistr.h" #include "unicode/uchar.h" +*** misc/icu/source/common/unicode/unistr.h 2007-05-20 08:40:31.000000000 -0400 +--- 
misc/build/icu/source/common/unicode/unistr.h 2007-05-20 08:42:01.000000000 -0400 +*************** +*** 3280,3286 **** + //======================================== + inline int8_t + UnicodeString::doCompare(int32_t start, +! int32_t length, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) const +--- 3280,3286 ---- + //======================================== + inline int8_t + UnicodeString::doCompare(int32_t start, +! int32_t _length, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) const +*************** +*** 3289,3295 **** + return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise + } else { + srcText.pinIndices(srcStart, srcLength); +! return doCompare(start, length, srcText.fArray, srcStart, srcLength); + } + } + +--- 3289,3295 ---- + return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise + } else { + srcText.pinIndices(srcStart, srcLength); +! return doCompare(start, _length, srcText.fArray, srcStart, srcLength); + } + } + +*************** +*** 3374,3380 **** + + inline int8_t + UnicodeString::doCompareCodePointOrder(int32_t start, +! int32_t length, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) const +--- 3374,3380 ---- + + inline int8_t + UnicodeString::doCompareCodePointOrder(int32_t start, +! int32_t _length, + const UnicodeString& srcText, + int32_t srcStart, + int32_t srcLength) const +*************** +*** 3383,3389 **** + return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise + } else { + srcText.pinIndices(srcStart, srcLength); +! return doCompareCodePointOrder(start, length, srcText.fArray, srcStart, srcLength); + } + } + +--- 3383,3389 ---- + return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise + } else { + srcText.pinIndices(srcStart, srcLength); +! return doCompareCodePointOrder(start, _length, srcText.fArray, srcStart, srcLength); + } + } + +*************** +*** 3435,3441 **** + + inline int8_t + UnicodeString::doCaseCompare(int32_t start, +! 
int32_t length, + const UnicodeString &srcText, + int32_t srcStart, + int32_t srcLength, +--- 3435,3441 ---- + + inline int8_t + UnicodeString::doCaseCompare(int32_t start, +! int32_t _length, + const UnicodeString &srcText, + int32_t srcStart, + int32_t srcLength, +*************** +*** 3445,3451 **** + return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise + } else { + srcText.pinIndices(srcStart, srcLength); +! return doCaseCompare(start, length, srcText.fArray, srcStart, srcLength, options); + } + } + +--- 3445,3451 ---- + return (int8_t)!isBogus(); // 0 if both are bogus, 1 otherwise + } else { + srcText.pinIndices(srcStart, srcLength); +! return doCaseCompare(start, _length, srcText.fArray, srcStart, srcLength, options); + } + } +