diff -r e6b939051504 Makefile.in --- a/Makefile.in Tue Apr 01 12:51:18 2008 +0800 +++ b/Makefile.in Mon Apr 07 11:02:46 2008 +0800 @@ -1,8 +1,8 @@ -# Makefile.in generated by automake 1.9.2 from Makefile.am. +# Makefile.in generated by automake 1.9.6 from Makefile.am. # @configure_input@ # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, -# 2003, 2004 Free Software Foundation, Inc. +# 2003, 2004, 2005 Free Software Foundation, Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. @@ -118,6 +118,8 @@ MAINTAINER_MODE_FALSE = @MAINTAINER_MODE MAINTAINER_MODE_FALSE = @MAINTAINER_MODE_FALSE@ MAINTAINER_MODE_TRUE = @MAINTAINER_MODE_TRUE@ MAKEINFO = @MAKEINFO@ +MMSEG_CFLAGS = @MMSEG_CFLAGS@ +MMSEG_LIBS = @MMSEG_LIBS@ MYSQL_CFLAGS = @MYSQL_CFLAGS@ MYSQL_LIBS = @MYSQL_LIBS@ OBJEXT = @OBJEXT@ @@ -136,6 +138,8 @@ STRIP = @STRIP@ STRIP = @STRIP@ USE_LIBSTEMMER_FALSE = @USE_LIBSTEMMER_FALSE@ USE_LIBSTEMMER_TRUE = @USE_LIBSTEMMER_TRUE@ +USE_MMSEG_FALSE = @USE_MMSEG_FALSE@ +USE_MMSEG_TRUE = @USE_MMSEG_TRUE@ USE_MYSQL_FALSE = @USE_MYSQL_FALSE@ USE_MYSQL_TRUE = @USE_MYSQL_TRUE@ USE_PGSQL_FALSE = @USE_PGSQL_FALSE@ @@ -260,7 +264,13 @@ uninstall-sysconfDATA: # (which will cause the Makefiles to be regenerated when you run `make'); # (2) otherwise, pass the desired values on the `make' command line. $(RECURSIVE_TARGETS): - @set fnord $$MAKEFLAGS; amf=$$2; \ + @failcom='exit 1'; \ + for f in x $$MAKEFLAGS; do \ + case $$f in \ + *=* | --[!k]*);; \ + *k*) failcom='fail=yes';; \ + esac; \ + done; \ dot_seen=no; \ target=`echo $@ | sed s/-recursive//`; \ list='$(SUBDIRS)'; for subdir in $$list; do \ @@ -272,7 +282,7 @@ uninstall-sysconfDATA: local_target="$$target"; \ fi; \ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ - || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \ + || eval $$failcom; \ done; \ if test "$$dot_seen" = "no"; then \ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ @@ -280,7 +290,13 @@ uninstall-sysconfDATA: mostlyclean-recursive clean-recursive distclean-recursive \ maintainer-clean-recursive: - @set fnord $$MAKEFLAGS; amf=$$2; \ + @failcom='exit 1'; \ + for f in x $$MAKEFLAGS; do \ + case $$f in \ + *=* | --[!k]*);; \ + *k*) failcom='fail=yes';; \ + esac; \ + done; \ dot_seen=no; \ case "$@" in \ distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ @@ -301,7 +317,7 @@ maintainer-clean-recursive: local_target="$$target"; \ fi; \ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ - || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \ + || eval $$failcom; \ done && test -z "$$fail" tags-recursive: list='$(SUBDIRS)'; for subdir in $$list; do \ diff -r e6b939051504 acinclude.m4 --- a/acinclude.m4 Tue Apr 01 12:51:18 2008 +0800 +++ b/acinclude.m4 Mon Apr 07 11:02:46 2008 +0800 @@ -254,6 +254,103 @@ fi ]) dnl --------------------------------------------------------------------------- +dnl Macro: AC_CHECK_MMSEG +dnl First check for custom PostgreSQL paths in --with-pgsql-* options. +dnl If some paths are missing, check if pg_config exists. +dnl --------------------------------------------------------------------------- + +AC_DEFUN([AC_CHECK_MMSEG],[ + +# if there's nothing from mysql_config, check well-known include paths +# explicit overrides will be applied later +if test [ -z "$MMSEG_CFLAGS" ] +then + for CANDIDATE in "$user_mmseg_includes" "/usr/local/include/mmseg" "/usr/include/mmseg" + do + if test [ -n "$CANDIDATE" -a -r "$CANDIDATE/Segmenter.h" ] + then + MMSEG_CFLAGS="-I$CANDIDATE" + break + fi + done +fi + + +# if there's nothing from mysql_config, check well-known library paths +# explicit overrides will be applied later +if test [ -z "$MMSEG_LIBS" ] +then + for CANDIDATE in "$user_mmseg_libs" "/usr/lib64" \ + "/usr/local/lib" "/usr/local/mmseg/lib" \ + "/usr/local/lib/mmseg" "/usr/lib" \ + "/opt/mmseg/lib" + do + if test [ -n "$CANDIDATE" -a -d "$CANDIDATE" ] + then + MMSEG_LIBS="-L$CANDIDATE -lmmseg" + break + fi + done +fi + + + +# apply explicit include path overrides +AC_ARG_WITH([mmseg-includes], + AC_HELP_STRING([--with-mmseg-includes], [path to libmmseg header files]), + [ac_cv_mmseg_includes=$withval]) +if test [ -n "$ac_cv_mmseg_includes" ] +then + MMSEG_CFLAGS="-I$ac_cv_mmseg_includes" +fi + + +# apply explicit lib path overrides +AC_ARG_WITH([mmseg-libs], + AC_HELP_STRING([--with-mmseg-libs], [path to libmmseg libraries]), + [ac_cv_mmseg_libs=$withval]) +if test [ -n "$ac_cv_mmseg_libs" ] +then + # Trim trailing '.libs' if user passed it in --with-mysql-libs option + ac_cv_mmseg_libs=`echo ${ac_cv_mmseg_libs} | sed -e 's/.libs$//' \ + -e 's+.libs/$++'` + MMSEG_LIBS="-L$ac_cv_mmseg_libs -lmmseg" +fi + +# now that we did all we could, perform final checks +AC_MSG_CHECKING([libmmseg include files]) +if test [ -z "$MMSEG_CFLAGS" ] +then + AC_MSG_ERROR([missing include files. + +****************************************************************************** +ERROR: cannot find libmmseg include files. + +To disable libmmseg support, use --without-mmseg option. +****************************************************************************** +]) +else + AC_MSG_RESULT([$MMSEG_CFLAGS]) +fi + +AC_MSG_CHECKING([libmmseg libraries]) +if test [ -z "$MMSEG_LIBS" ] +then + AC_MSG_ERROR([missing libraries. + +****************************************************************************** +ERROR: cannot find libmmseg libraries. + +To disable libmmseg support, use --without-mmseg option. +****************************************************************************** +]) +else + AC_MSG_RESULT([$MMSEG_LIBS]) +fi + +]) + +dnl --------------------------------------------------------------------------- dnl Macro: SPHINX_CONFIGURE_PART dnl dnl Tells what stage is ./configure running now, nicely formatted diff -r e6b939051504 config/config.h.in --- a/config/config.h.in Tue Apr 01 12:51:18 2008 +0800 +++ b/config/config.h.in Mon Apr 07 11:02:46 2008 +0800 @@ -199,6 +199,9 @@ /* libstemmer support */ #undef USE_LIBSTEMMER +/* Define to 1 if you want to compile with libmmseg support */ +#undef USE_MMSEG + /* Define to 1 if you want to compile with MySQL support */ #undef USE_MYSQL diff -r e6b939051504 configure --- a/configure Tue Apr 01 12:51:18 2008 +0800 +++ b/configure Mon Apr 07 11:02:46 2008 +0800 @@ -311,7 +311,7 @@ ac_includes_default="\ # include #endif" -ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA CYGPATH_W PACKAGE VERSION ACLOCAL AUTOCONF AUTOMAKE AUTOHEADER MAKEINFO install_sh STRIP ac_ct_STRIP INSTALL_STRIP_PROGRAM mkdir_p AWK SET_MAKE am__leading_dot AMTAR am__tar am__untar MAINTAINER_MODE_TRUE MAINTAINER_MODE_FALSE MAINT CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT DEPDIR am__include am__quote AMDEP_TRUE AMDEP_FALSE AMDEPBACKSLASH CCDEPMODE am__fastdepCC_TRUE am__fastdepCC_FALSE CXX CXXFLAGS ac_ct_CXX CXXDEPMODE am__fastdepCXX_TRUE am__fastdepCXX_FALSE RANLIB ac_ct_RANLIB CPP EGREP LIBOBJS MYSQL_LIBS MYSQL_CFLAGS USE_MYSQL_TRUE USE_MYSQL_FALSE pgconfig PGSQL_LIBS PGSQL_CFLAGS USE_PGSQL_TRUE USE_PGSQL_FALSE USE_LIBSTEMMER_TRUE USE_LIBSTEMMER_FALSE CONFDIR LTLIBOBJS' +ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA CYGPATH_W PACKAGE VERSION ACLOCAL AUTOCONF AUTOMAKE AUTOHEADER MAKEINFO install_sh STRIP ac_ct_STRIP INSTALL_STRIP_PROGRAM mkdir_p AWK SET_MAKE am__leading_dot AMTAR am__tar am__untar MAINTAINER_MODE_TRUE MAINTAINER_MODE_FALSE MAINT CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT DEPDIR am__include am__quote AMDEP_TRUE AMDEP_FALSE AMDEPBACKSLASH CCDEPMODE am__fastdepCC_TRUE am__fastdepCC_FALSE CXX CXXFLAGS ac_ct_CXX CXXDEPMODE am__fastdepCXX_TRUE am__fastdepCXX_FALSE RANLIB ac_ct_RANLIB CPP EGREP LIBOBJS MYSQL_LIBS MYSQL_CFLAGS USE_MYSQL_TRUE USE_MYSQL_FALSE pgconfig PGSQL_LIBS PGSQL_CFLAGS USE_PGSQL_TRUE USE_PGSQL_FALSE MMSEG_LIBS MMSEG_CFLAGS USE_MMSEG_TRUE USE_MMSEG_FALSE USE_LIBSTEMMER_TRUE USE_LIBSTEMMER_FALSE CONFDIR LTLIBOBJS' ac_subst_files='' # Initialize some variables set by options. @@ -874,6 +874,10 @@ Optional Packages: disabled) --with-pgsql-includes path to PostgreSQL header files --with-pgsql-libs path to PostgreSQL libraries + --with-mmseg compile with libmmseg, a mmseg Chinese Segmenter + support (default is enabled) + --with-mmseg-includes path to libmmseg header files + --with-mmseg-libs path to libmmseg libraries --with-libstemmer compile with libstemmer support (default is disabled) @@ -8062,9 +8066,163 @@ fi fi +# check if we should complie with libmmseg (a mmseg Chinese Segmenter) support + +# Check whether --with-mmseg or --without-mmseg was given. +if test "${with_mmseg+set}" = set; then + withval="$with_mmseg" + ac_cv_use_mmseg=$withval +else + ac_cv_use_mmseg=yes + +fi; +echo "$as_me:$LINENO: checking whether to compile with libmmseg support" >&5 +echo $ECHO_N "checking whether to compile with libmmseg support... $ECHO_C" >&6 +if test x$ac_cv_use_mmseg != xno; then + echo "$as_me:$LINENO: result: yes" >&5 +echo "${ECHO_T}yes" >&6 + + +# if there's nothing from mysql_config, check well-known include paths +# explicit overrides will be applied later +if test -z "$MMSEG_CFLAGS" +then + for CANDIDATE in "$user_mmseg_includes" "/usr/local/include/mmseg" "/usr/include/mmseg" + do + if test -n "$CANDIDATE" -a -r "$CANDIDATE/Segmenter.h" + then + MMSEG_CFLAGS="-I$CANDIDATE" + break + fi + done +fi + + +# if there's nothing from mysql_config, check well-known library paths +# explicit overrides will be applied later +if test -z "$MMSEG_LIBS" +then + for CANDIDATE in "$user_mmseg_libs" "/usr/lib64" \ + "/usr/local/lib" "/usr/local/mmseg/lib" \ + "/usr/local/lib/mmseg" "/usr/lib" \ + "/opt/mmseg/lib" + do + if test -n "$CANDIDATE" -a -d "$CANDIDATE" + then + MMSEG_LIBS="-L$CANDIDATE -lmmseg" + break + fi + done +fi + + + +# apply explicit include path overrides + +# Check whether --with-mmseg-includes or --without-mmseg-includes was given. +if test "${with_mmseg_includes+set}" = set; then + withval="$with_mmseg_includes" + ac_cv_mmseg_includes=$withval +fi; +if test -n "$ac_cv_mmseg_includes" +then + MMSEG_CFLAGS="-I$ac_cv_mmseg_includes" +fi + + +# apply explicit lib path overrides + +# Check whether --with-mmseg-libs or --without-mmseg-libs was given. +if test "${with_mmseg_libs+set}" = set; then + withval="$with_mmseg_libs" + ac_cv_mmseg_libs=$withval +fi; +if test -n "$ac_cv_mmseg_libs" +then + # Trim trailing '.libs' if user passed it in --with-mysql-libs option + ac_cv_mmseg_libs=`echo ${ac_cv_mmseg_libs} | sed -e 's/.libs$//' \ + -e 's+.libs/$++'` + MMSEG_LIBS="-L$ac_cv_mmseg_libs -lmmseg" +fi + +# now that we did all we could, perform final checks +echo "$as_me:$LINENO: checking libmmseg include files" >&5 +echo $ECHO_N "checking libmmseg include files... $ECHO_C" >&6 +if test -z "$MMSEG_CFLAGS" +then + { { echo "$as_me:$LINENO: error: missing include files. + +****************************************************************************** +ERROR: cannot find libmmseg include files. + +To disable libmmseg support, use --without-mmseg option. +****************************************************************************** +" >&5 +echo "$as_me: error: missing include files. + +****************************************************************************** +ERROR: cannot find libmmseg include files. + +To disable libmmseg support, use --without-mmseg option. +****************************************************************************** +" >&2;} + { (exit 1); exit 1; }; } +else + echo "$as_me:$LINENO: result: $MMSEG_CFLAGS" >&5 +echo "${ECHO_T}$MMSEG_CFLAGS" >&6 +fi + +echo "$as_me:$LINENO: checking libmmseg libraries" >&5 +echo $ECHO_N "checking libmmseg libraries... $ECHO_C" >&6 +if test -z "$MMSEG_LIBS" +then + { { echo "$as_me:$LINENO: error: missing libraries. + +****************************************************************************** +ERROR: cannot find libmmseg libraries. + +To disable libmmseg support, use --without-mmseg option. +****************************************************************************** +" >&5 +echo "$as_me: error: missing libraries. + +****************************************************************************** +ERROR: cannot find libmmseg libraries. + +To disable libmmseg support, use --without-mmseg option. +****************************************************************************** +" >&2;} + { (exit 1); exit 1; }; } +else + echo "$as_me:$LINENO: result: $MMSEG_LIBS" >&5 +echo "${ECHO_T}$MMSEG_LIBS" >&6 +fi + + + +cat >>confdefs.h <<\_ACEOF +#define USE_MMSEG 1 +_ACEOF + + + +else + echo "$as_me:$LINENO: result: no" >&5 +echo "${ECHO_T}no" >&6 +fi + + +if test x$ac_cv_use_mmseg != xno; then + USE_MMSEG_TRUE= + USE_MMSEG_FALSE='#' +else + USE_MMSEG_TRUE='#' + USE_MMSEG_FALSE= +fi + # we can now set preprocessor flags for both C and C++ compilers -CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS" +CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $MMSEG_CFLAGS" echo "$as_me:$LINENO: checking whether to use 64-bit document/word IDs" >&5 @@ -8504,6 +8662,13 @@ if test -z "${USE_PGSQL_TRUE}" && test - { { echo "$as_me:$LINENO: error: conditional \"USE_PGSQL\" was never defined. Usually this means the macro was only invoked conditionally." >&5 echo "$as_me: error: conditional \"USE_PGSQL\" was never defined. +Usually this means the macro was only invoked conditionally." >&2;} + { (exit 1); exit 1; }; } +fi +if test -z "${USE_MMSEG_TRUE}" && test -z "${USE_MMSEG_FALSE}"; then + { { echo "$as_me:$LINENO: error: conditional \"USE_MMSEG\" was never defined. +Usually this means the macro was only invoked conditionally." >&5 +echo "$as_me: error: conditional \"USE_MMSEG\" was never defined. Usually this means the macro was only invoked conditionally." >&2;} { (exit 1); exit 1; }; } fi @@ -9110,6 +9275,10 @@ s,@PGSQL_CFLAGS@,$PGSQL_CFLAGS,;t t s,@PGSQL_CFLAGS@,$PGSQL_CFLAGS,;t t s,@USE_PGSQL_TRUE@,$USE_PGSQL_TRUE,;t t s,@USE_PGSQL_FALSE@,$USE_PGSQL_FALSE,;t t +s,@MMSEG_LIBS@,$MMSEG_LIBS,;t t +s,@MMSEG_CFLAGS@,$MMSEG_CFLAGS,;t t +s,@USE_MMSEG_TRUE@,$USE_MMSEG_TRUE,;t t +s,@USE_MMSEG_FALSE@,$USE_MMSEG_FALSE,;t t s,@USE_LIBSTEMMER_TRUE@,$USE_LIBSTEMMER_TRUE,;t t s,@USE_LIBSTEMMER_FALSE@,$USE_LIBSTEMMER_FALSE,;t t s,@CONFDIR@,$CONFDIR,;t t diff -r e6b939051504 configure.ac --- a/configure.ac Tue Apr 01 12:51:18 2008 +0800 +++ b/configure.ac Mon Apr 07 11:02:46 2008 +0800 @@ -141,9 +141,26 @@ fi fi AM_CONDITIONAL(USE_PGSQL, test x$ac_cv_use_pgsql != xno) +dnl --- +# check if we should complie with libmmseg (a mmseg Chinese Segmenter) support +AC_ARG_WITH([mmseg], + AC_HELP_STRING([--with-mmseg], [compile with libmmseg, a mmseg Chinese Segmenter support (default is enabled)]), + [ac_cv_use_mmseg=$withval], [ac_cv_use_mmseg=yes] +) +AC_MSG_CHECKING([whether to compile with libmmseg support]) +if test x$ac_cv_use_mmseg != xno; then + AC_MSG_RESULT([yes]) + AC_CHECK_MMSEG([$ac_cv_use_mmseg]) + AC_DEFINE(USE_MMSEG,1,[Define to 1 if you want to compile with libmmseg support]) + AC_SUBST([MMSEG_LIBS]) + AC_SUBST([MMSEG_CFLAGS]) +else + AC_MSG_RESULT([no]) +fi +AM_CONDITIONAL(USE_MMSEG, test x$ac_cv_use_mmseg != xno) # we can now set preprocessor flags for both C and C++ compilers -CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS" +CPPFLAGS="$CPPFLAGS $MYSQL_CFLAGS $PGSQL_CFLAGS $MMSEG_CFLAGS" dnl --- diff -r e6b939051504 libstemmer_c/Makefile.in --- a/libstemmer_c/Makefile.in Tue Apr 01 12:51:18 2008 +0800 +++ b/libstemmer_c/Makefile.in Mon Apr 07 11:02:46 2008 +0800 @@ -1,8 +1,8 @@ -# Makefile.in generated by automake 1.9.2 from Makefile.am. +# Makefile.in generated by automake 1.9.6 from Makefile.am. # @configure_input@ # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, -# 2003, 2004 Free Software Foundation, Inc. +# 2003, 2004, 2005 Free Software Foundation, Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. @@ -23,8 +23,6 @@ # german, hungarian, italian, norwegian, porter, portuguese, romanian, # russian, spanish, swedish, turkish - -SOURCES = $(libstemmer_a_SOURCES) srcdir = @srcdir@ top_srcdir = @top_srcdir@ @@ -182,6 +180,8 @@ MAINTAINER_MODE_FALSE = @MAINTAINER_MODE MAINTAINER_MODE_FALSE = @MAINTAINER_MODE_FALSE@ MAINTAINER_MODE_TRUE = @MAINTAINER_MODE_TRUE@ MAKEINFO = @MAKEINFO@ +MMSEG_CFLAGS = @MMSEG_CFLAGS@ +MMSEG_LIBS = @MMSEG_LIBS@ MYSQL_CFLAGS = @MYSQL_CFLAGS@ MYSQL_LIBS = @MYSQL_LIBS@ OBJEXT = @OBJEXT@ @@ -200,6 +200,8 @@ STRIP = @STRIP@ STRIP = @STRIP@ USE_LIBSTEMMER_FALSE = @USE_LIBSTEMMER_FALSE@ USE_LIBSTEMMER_TRUE = @USE_LIBSTEMMER_TRUE@ +USE_MMSEG_FALSE = @USE_MMSEG_FALSE@ +USE_MMSEG_TRUE = @USE_MMSEG_TRUE@ USE_MYSQL_FALSE = @USE_MYSQL_FALSE@ USE_MYSQL_TRUE = @USE_MYSQL_TRUE@ USE_PGSQL_FALSE = @USE_PGSQL_FALSE@ diff -r e6b939051504 src/Makefile.am --- a/src/Makefile.am Tue Apr 01 12:51:18 2008 +0800 +++ b/src/Makefile.am Mon Apr 07 11:02:46 2008 +0800 @@ -1,6 +1,6 @@ SRC_SPHINX = sphinx.cpp sphinxexcerpt.cp SRC_SPHINX = sphinx.cpp sphinxexcerpt.cpp sphinxquery.cpp \ sphinxsoundex.cpp sphinxmetaphone.cpp sphinxstemen.cpp sphinxstemru.cpp \ - sphinxutils.cpp md5.cpp sphinxstd.cpp sphinxsort.cpp sphinxexpr.cpp + sphinxutils.cpp md5.cpp sphinxstd.cpp sphinxsort.cpp sphinxexpr.cpp tokenizer_zhcn.cpp noinst_LIBRARIES = libsphinx.a libsphinx_a_SOURCES = $(SRC_SPHINX) @@ -27,7 +27,7 @@ AM_CPPFLAGS = -DSYSCONFDIR="\"$(sysconfd AM_CPPFLAGS = -DSYSCONFDIR="\"$(sysconfdir)\"" endif -COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) +COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(MMSEG_LIBS) indexer_LDADD = $(COMMON_LIBS) searchd_LDADD = $(COMMON_LIBS) -lz search_LDADD = $(COMMON_LIBS) diff -r e6b939051504 src/Makefile.in --- a/src/Makefile.in Tue Apr 01 12:51:18 2008 +0800 +++ b/src/Makefile.in Mon Apr 07 11:02:46 2008 +0800 @@ -1,8 +1,8 @@ -# Makefile.in generated by automake 1.9.2 from Makefile.am. +# Makefile.in generated by automake 1.9.6 from Makefile.am. # @configure_input@ # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, -# 2003, 2004 Free Software Foundation, Inc. +# 2003, 2004, 2005 Free Software Foundation, Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. @@ -14,8 +14,6 @@ @SET_MAKE@ - -SOURCES = $(libsphinx_a_SOURCES) $(indexer_SOURCES) $(search_SOURCES) $(searchd_SOURCES) $(spelldump_SOURCES) $(tests_SOURCES) srcdir = @srcdir@ top_srcdir = @top_srcdir@ @@ -59,7 +57,8 @@ am__objects_1 = sphinx.$(OBJEXT) sphinxe sphinxquery.$(OBJEXT) sphinxsoundex.$(OBJEXT) \ sphinxmetaphone.$(OBJEXT) sphinxstemen.$(OBJEXT) \ sphinxstemru.$(OBJEXT) sphinxutils.$(OBJEXT) md5.$(OBJEXT) \ - sphinxstd.$(OBJEXT) sphinxsort.$(OBJEXT) sphinxexpr.$(OBJEXT) + sphinxstd.$(OBJEXT) sphinxsort.$(OBJEXT) sphinxexpr.$(OBJEXT) \ + tokenizer_zhcn.$(OBJEXT) am_libsphinx_a_OBJECTS = $(am__objects_1) libsphinx_a_OBJECTS = $(am_libsphinx_a_OBJECTS) am__installdirs = "$(DESTDIR)$(bindir)" @@ -71,7 +70,8 @@ indexer_OBJECTS = $(am_indexer_OBJECTS) @USE_LIBSTEMMER_TRUE@ $(top_srcdir)/libstemmer_c/libstemmer.a am__DEPENDENCIES_2 = am__DEPENDENCIES_3 = libsphinx.a $(am__DEPENDENCIES_1) \ - $(am__DEPENDENCIES_2) $(am__DEPENDENCIES_2) + $(am__DEPENDENCIES_2) $(am__DEPENDENCIES_2) \ + $(am__DEPENDENCIES_2) indexer_DEPENDENCIES = $(am__DEPENDENCIES_3) am_search_OBJECTS = search.$(OBJEXT) search_OBJECTS = $(am_search_OBJECTS) @@ -138,6 +138,8 @@ MAINTAINER_MODE_FALSE = @MAINTAINER_MODE MAINTAINER_MODE_FALSE = @MAINTAINER_MODE_FALSE@ MAINTAINER_MODE_TRUE = @MAINTAINER_MODE_TRUE@ MAKEINFO = @MAKEINFO@ +MMSEG_CFLAGS = @MMSEG_CFLAGS@ +MMSEG_LIBS = @MMSEG_LIBS@ MYSQL_CFLAGS = @MYSQL_CFLAGS@ MYSQL_LIBS = @MYSQL_LIBS@ OBJEXT = @OBJEXT@ @@ -156,6 +158,8 @@ STRIP = @STRIP@ STRIP = @STRIP@ USE_LIBSTEMMER_FALSE = @USE_LIBSTEMMER_FALSE@ USE_LIBSTEMMER_TRUE = @USE_LIBSTEMMER_TRUE@ +USE_MMSEG_FALSE = @USE_MMSEG_FALSE@ +USE_MMSEG_TRUE = @USE_MMSEG_TRUE@ USE_MYSQL_FALSE = @USE_MYSQL_FALSE@ USE_MYSQL_TRUE = @USE_MYSQL_TRUE@ USE_PGSQL_FALSE = @USE_PGSQL_FALSE@ @@ -197,7 +201,7 @@ target_alias = @target_alias@ target_alias = @target_alias@ SRC_SPHINX = sphinx.cpp sphinxexcerpt.cpp sphinxquery.cpp \ sphinxsoundex.cpp sphinxmetaphone.cpp sphinxstemen.cpp sphinxstemru.cpp \ - sphinxutils.cpp md5.cpp sphinxstd.cpp sphinxsort.cpp sphinxexpr.cpp + sphinxutils.cpp md5.cpp sphinxstd.cpp sphinxsort.cpp sphinxexpr.cpp tokenizer_zhcn.cpp noinst_LIBRARIES = libsphinx.a libsphinx_a_SOURCES = $(SRC_SPHINX) @@ -211,7 +215,7 @@ BUILT_SOURCES = extract-version @USE_LIBSTEMMER_TRUE@LIBSTEMMER_LIBS = $(top_srcdir)/libstemmer_c/libstemmer.a @USE_LIBSTEMMER_FALSE@AM_CPPFLAGS = -DSYSCONFDIR="\"$(sysconfdir)\"" @USE_LIBSTEMMER_TRUE@AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include -DSYSCONFDIR="\"$(sysconfdir)\"" -COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) +COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(MMSEG_LIBS) indexer_LDADD = $(COMMON_LIBS) searchd_LDADD = $(COMMON_LIBS) -lz search_LDADD = $(COMMON_LIBS) @@ -324,6 +328,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sphinxstemru.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sphinxutils.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tests.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tokenizer_zhcn.Po@am__quote@ .cpp.o: @am__fastdepCXX_TRUE@ if $(CXXCOMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ $<; \ diff -r e6b939051504 src/sphinx.cpp --- a/src/sphinx.cpp Tue Apr 01 12:51:18 2008 +0800 +++ b/src/sphinx.cpp Mon Apr 07 11:02:46 2008 +0800 @@ -92,8 +92,13 @@ #endif #if ( USE_WINDOWS && USE_LIBICONV ) - #pragma comment(linker, "/defaultlib:iconv.lib") + #pragma comment(linker, "/defaultlib:libiconv.lib") #pragma message("Automatically linking with iconv.lib") +#endif + +#if ( USE_WINDOWS && USE_MMSEG ) +#pragma comment(linker, "/defaultlib:libcss.lib") +#pragma message("Automatically linking with libcss.lib") #endif #if ( USE_WINDOWS && USE_LIBXML ) @@ -138,7 +143,7 @@ void sphAssert ( const char * sExpr, con // forward decl void sphWarn ( const char * sTemplate, ... ); int sphReadThrottled ( int iFD, void * pBuf, size_t iCount ); - +static inline bool IsSeparator ( int iFolded, bool bFirst ); ///////////////////////////////////////////////////////////////////////////// // GLOBALS ///////////////////////////////////////////////////////////////////////////// @@ -2069,77 +2074,6 @@ void sphSetInternalErrorCallback ( void #if USE_WINDOWS #pragma warning(disable:4127) // conditional expr is const for MSVC #endif -inline int sphUTF8Decode ( BYTE * & pBuf ); // forward ref for GCC -inline int sphUTF8Encode ( BYTE * pBuf, int iCode ); // forward ref for GCC - - -/// tokenizer implementation traits -template < bool IS_UTF8 > -class CSphTokenizerTraits : public ISphTokenizer -{ -public: - CSphTokenizerTraits (); - - virtual const BYTE * GetTokenStart () const; - virtual const BYTE * GetTokenEnd () const; - virtual const BYTE * GetBufferPtr () const; - virtual const BYTE * GetBufferEnd () const; - virtual void AdvanceBufferPtr ( int iOffset ); - -protected: - BYTE * GetTokenSyn (); - -protected: - /// get codepoint - inline int GetCodepoint () - { - if ( IS_UTF8 ) - { - while ( m_pCur=0 ) - return iCode; // succesful decode - } - return -1; // eof - } else - { - return m_pCur>=m_pBufferMax - ? -1 - : int(*m_pCur++); - } - } - - /// accum codepoint - inline void AccumCodepoint ( int iCode ) - { - assert ( iCode>0 ); - assert ( m_iAccum>=0 ); - - // throw away everything which is over the token size - if ( m_iAccum=m_sAccum && m_pAccumsetDictPath(dict_path); + return tokenizer; +} +#if USE_LIBICONV +/// create GBK tokenizer with Chinese Segment support +ISphTokenizer * sphCreateGBKChineseTokenizer (const char* dict_path ) +{ + CSphTokenizer_zh_CN_GBK* tokenizer = new CSphTokenizer_zh_CN_GBK (); + tokenizer->setDictPath(dict_path); + return tokenizer; +} +#endif + +#endif ///////////////////////////////////////////////////////////////////////////// CSphLowercaser::CSphLowercaser () @@ -2331,19 +2284,6 @@ void CSphLowercaser::AddRemaps ( const C } -enum -{ - MASK_CODEPOINT = 0x00ffffffUL, // mask off codepoint flags - MASK_FLAGS = 0xff000000UL, // mask off codepoint value - FLAG_CODEPOINT_SPECIAL = 0x01000000UL, // this codepoint is special - FLAG_CODEPOINT_DUAL = 0x02000000UL, // this codepoint is special but also a valid word part - FLAG_CODEPOINT_NGRAM = 0x04000000UL, // this codepoint is n-gram indexed - FLAG_CODEPOINT_SYNONYM = 0x08000000UL, // this codepoint is used in synonym tokens only - FLAG_CODEPOINT_BOUNDARY = 0x10000000UL, // this codepoint is phrase boundary - FLAG_CODEPOINT_IGNORE = 0x20000000UL // this codepoint is ignored -}; - - void CSphLowercaser::AddSpecials ( const char * sSpecials ) { assert ( sSpecials ); @@ -2625,82 +2565,6 @@ bool CSphCharsetDefinitionParser::Parse return true; } - -///////////////////////////////////////////////////////////////////////////// - -/// UTF-8 decode codepoint -/// advances buffer ptr in all cases but end of buffer -/// -/// returns -1 on failure -/// returns 0 on end of buffer -/// returns codepoint on success -inline int sphUTF8Decode ( BYTE * & pBuf ) -{ - BYTE v = *pBuf; - if ( !v ) - return 0; - pBuf++; - - // check for 7-bit case - if ( v<128 ) - return v; - - // get number of bytes - int iBytes = 0; - while ( v & 0x80 ) - { - iBytes++; - v <<= 1; - } - - // check for valid number of bytes - if ( iBytes<2 || iBytes>4 ) - return -1; - - int iCode = ( v>>iBytes ); - iBytes--; - do - { - if ( !(*pBuf) ) - return 0; // unexpected eof - - if ( ((*pBuf) & 0xC0)!=0x80 ) - return -1; // invalid code - - iCode = ( iCode<<6 ) + ( (*pBuf) & 0x3F ); - iBytes--; - pBuf++; - } while ( iBytes ); - - // all good - return iCode; -} - - -/// UTF-8 encode codepoint to buffer -/// returns number of bytes used -inline int sphUTF8Encode ( BYTE * pBuf, int iCode ) -{ - if ( iCode<0x80 ) - { - pBuf[0] = (BYTE)( iCode & 0x7F ); - return 1; - - } else if ( iCode<0x800 ) - { - pBuf[0] = (BYTE)( ( (iCode>>6) & 0x1F ) | 0xC0 ); - pBuf[1] = (BYTE)( ( iCode & 0x3F ) | 0x80 ); - return 2; - - } else - { - pBuf[0] = (BYTE)( ( (iCode>>12) & 0x0F ) | 0xE0 ); - pBuf[1] = (BYTE)( ( (iCode>>6) & 0x3F ) | 0x80 ); - pBuf[2] = (BYTE)( ( iCode & 0x3F ) | 0x80 ); - return 3; - } -} - int sphUTF8Len ( const char * pStr ) { @@ -3002,51 +2866,6 @@ bool ISphTokenizer::LoadSynonyms ( const } ////////////////////////////////////////////////////////////////////////// - -template < bool IS_UTF8 > -CSphTokenizerTraits::CSphTokenizerTraits () - : m_pBuffer ( NULL ) - , m_pBufferMax ( NULL ) - , m_pCur ( NULL ) - , m_pTokenStart ( NULL ) - , m_pTokenEnd ( NULL ) - , m_iAccum ( 0 ) -{ - m_pAccum = m_sAccum; -} - -template < bool IS_UTF8 > -const BYTE * CSphTokenizerTraits::GetTokenStart () const -{ - return m_pTokenStart; -} - -template < bool IS_UTF8 > -const BYTE * CSphTokenizerTraits::GetTokenEnd () const -{ - return m_pTokenEnd; -} - -template < bool IS_UTF8 > -const BYTE * CSphTokenizerTraits::GetBufferPtr () const -{ - return m_pCur; -} - -template < bool IS_UTF8 > -const BYTE * CSphTokenizerTraits::GetBufferEnd () const -{ - return m_pBufferMax; -} - -template < bool IS_UTF8 > -void CSphTokenizerTraits::AdvanceBufferPtr ( int iOffset ) -{ - assert ( iOffset >= 0 ); - m_pCur = Min ( m_pBufferMax, m_pCur + iOffset ); - m_iAccum = 0; - m_pAccum = m_sAccum; -} enum SynCheck_e @@ -6624,7 +6443,8 @@ int CSphIndex_VLN::Build ( CSphDict * pD // check for eof if ( !pSource->m_tDocInfo.m_iDocID ) break; - + //debug use only + //printf("docid:%d\t",pSource->m_tDocInfo.m_iDocID); // show progress bar if ( m_pProgress && ( ( pSource->GetStats().m_iTotalDocuments % 1000 )==0 ) ) diff -r e6b939051504 src/sphinx.h --- a/src/sphinx.h Tue Apr 01 12:51:18 2008 +0800 +++ b/src/sphinx.h Mon Apr 07 11:02:46 2008 +0800 @@ -18,10 +18,11 @@ #ifdef _WIN32 #define USE_MYSQL 1 /// whether to compile MySQL support - #define USE_LIBEXPAT 1 /// whether to compile libexpat support + #define USE_LIBEXPAT 0 /// whether to compile libexpat support #define USE_LIBICONV 1 /// whether to compile iconv support #define USE_LIBXML 0 /// whether to compile libxml support #define USE_WINDOWS 1 /// whether to compile for Windows + #define USE_MMSEG 1 /// whether to compile iconv support #else #define USE_WINDOWS 0 /// whether to compile for Windows #endif @@ -225,6 +226,18 @@ extern const char * SPHINX_DEFAULT_SBCS extern const char * SPHINX_DEFAULT_SBCS_TABLE; extern const char * SPHINX_DEFAULT_UTF8_TABLE; +enum +{ + MASK_CODEPOINT = 0x00ffffffUL, // mask off codepoint flags + MASK_FLAGS = 0xff000000UL, // mask off codepoint value + FLAG_CODEPOINT_SPECIAL = 0x01000000UL, // this codepoint is special + FLAG_CODEPOINT_DUAL = 0x02000000UL, // this codepoint is special but also a valid word part + FLAG_CODEPOINT_NGRAM = 0x04000000UL, // this codepoint is n-gram indexed + FLAG_CODEPOINT_SYNONYM = 0x08000000UL, // this codepoint is used in synonym tokens only + FLAG_CODEPOINT_BOUNDARY = 0x10000000UL, // this codepoint is phrase boundary + FLAG_CODEPOINT_IGNORE = 0x20000000UL // this codepoint is ignored +}; + ///////////////////////////////////////////////////////////////////////////// /// lowercaser remap range @@ -440,6 +453,198 @@ protected: CSphVector m_dSynEnd; ///< map 1st byte to candidate range end }; +inline int sphUTF8Decode ( BYTE * & pBuf ); // forward ref for GCC +inline int sphUTF8Encode ( BYTE * pBuf, int iCode ); // forward ref for GCC + +///////////////////////////////////////////////////////////////////////////// + +/// UTF-8 decode codepoint +/// advances buffer ptr in all cases but end of buffer +/// +/// returns -1 on failure +/// returns 0 on end of buffer +/// returns codepoint on success +inline int sphUTF8Decode ( BYTE * & pBuf ) +{ + BYTE v = *pBuf; + if ( !v ) + return 0; + pBuf++; + + // check for 7-bit case + if ( v<128 ) + return v; + + // get number of bytes + int iBytes = 0; + while ( v & 0x80 ) + { + iBytes++; + v <<= 1; + } + + // check for valid number of bytes + if ( iBytes<2 || iBytes>4 ) + return -1; + + int iCode = ( v>>iBytes ); + iBytes--; + do + { + if ( !(*pBuf) ) + return 0; // unexpected eof + + if ( ((*pBuf) & 0xC0)!=0x80 ) + return -1; // invalid code + + iCode = ( iCode<<6 ) + ( (*pBuf) & 0x3F ); + iBytes--; + pBuf++; + } while ( iBytes ); + + // all good + return iCode; +} + + +/// UTF-8 encode codepoint to buffer +/// returns number of bytes used +inline int sphUTF8Encode ( BYTE * pBuf, int iCode ) +{ + if ( iCode<0x80 ) + { + pBuf[0] = (BYTE)( iCode & 0x7F ); + return 1; + + } else if ( iCode<0x800 ) + { + pBuf[0] = (BYTE)( ( (iCode>>6) & 0x1F ) | 0xC0 ); + pBuf[1] = (BYTE)( ( iCode & 0x3F ) | 0x80 ); + return 2; + + } else + { + pBuf[0] = (BYTE)( ( (iCode>>12) & 0x0F ) | 0xE0 ); + pBuf[1] = (BYTE)( ( (iCode>>6) & 0x3F ) | 0x80 ); + pBuf[2] = (BYTE)( ( iCode & 0x3F ) | 0x80 ); + return 3; + } +} + + +/// tokenizer implementation traits +template < bool IS_UTF8 > +class CSphTokenizerTraits : public ISphTokenizer +{ +public: + CSphTokenizerTraits (); + + virtual const BYTE * GetTokenStart () const; + virtual const BYTE * GetTokenEnd () const; + virtual const BYTE * GetBufferPtr () const; + virtual const BYTE * GetBufferEnd () const; + virtual void AdvanceBufferPtr ( int iOffset ); + +protected: + BYTE * GetTokenSyn (); + +protected: + /// get codepoint + inline int GetCodepoint () + { + if ( IS_UTF8 ) + { + while ( m_pCur=0 ) + return iCode; // succesful decode + } + return -1; // eof + } else + { + return m_pCur>=m_pBufferMax + ? -1 + : int(*m_pCur++); + } + } + + /// accum codepoint + inline void AccumCodepoint ( int iCode ) + { + assert ( iCode>0 ); + assert ( m_iAccum>=0 ); + + // throw away everything which is over the token size + if ( m_iAccum=m_sAccum && m_pAccum +CSphTokenizerTraits::CSphTokenizerTraits () + : m_pBuffer ( NULL ) + , m_pBufferMax ( NULL ) + , m_pCur ( NULL ) + , m_pTokenStart ( NULL ) + , m_pTokenEnd ( NULL ) + , m_iAccum ( 0 ) +{ + m_pAccum = m_sAccum; +} + +template < bool IS_UTF8 > +const BYTE * CSphTokenizerTraits::GetTokenStart () const +{ + return m_pTokenStart; +} + +template < bool IS_UTF8 > +const BYTE * CSphTokenizerTraits::GetTokenEnd () const +{ + return m_pTokenEnd; +} + +template < bool IS_UTF8 > +const BYTE * CSphTokenizerTraits::GetBufferPtr () const +{ + return m_pCur; +} + +template < bool IS_UTF8 > +const BYTE * CSphTokenizerTraits::GetBufferEnd () const +{ + return m_pBufferMax; +} + +template < bool IS_UTF8 > +void CSphTokenizerTraits::AdvanceBufferPtr ( int iOffset ) +{ + assert ( iOffset >= 0 ); + m_pCur = Min ( m_pBufferMax, m_pCur + iOffset ); + m_iAccum = 0; + m_pAccum = m_sAccum; +} + /// create SBCS tokenizer ISphTokenizer * sphCreateSBCSTokenizer (); @@ -449,6 +654,10 @@ ISphTokenizer * sphCreateUTF8Tokenizer /// create UTF-8 tokenizer with n-grams support (for CJK n-gram indexing) ISphTokenizer * sphCreateUTF8NgramTokenizer (); +/// create UTF-8 tokenizer with Chinese Segment support +ISphTokenizer * sphCreateUTF8ChineseTokenizer ( const char* dict_path ); +/// create GBK tokenizer with Chinese Segment support +ISphTokenizer * sphCreateGBKChineseTokenizer ( const char* dict_path ); ///////////////////////////////////////////////////////////////////////////// // DICTIONARIES ///////////////////////////////////////////////////////////////////////////// diff -r e6b939051504 src/sphinxmetaphone.cpp --- a/src/sphinxmetaphone.cpp Tue Apr 01 12:51:18 2008 +0800 +++ b/src/sphinxmetaphone.cpp Mon Apr 07 11:02:46 2008 +0800 @@ -12,50 +12,6 @@ // #include "sphinx.h" - -// FIXME! merge this copy with the original -inline int sphUTF8Decode ( BYTE * & pBuf ) -{ - BYTE v = *pBuf; - if ( !v ) - return 0; - pBuf++; - - // check for 7-bit case - if ( v<128 ) - return v; - - // get number of bytes - int iBytes = 0; - while ( v & 0x80 ) - { - iBytes++; - v <<= 1; - } - - // check for valid number of bytes - if ( iBytes<2 || iBytes>4 ) - return -1; - - int iCode = ( v>>iBytes ); - iBytes--; - do - { - if ( !(*pBuf) ) - return 0; // unexpected eof - - if ( ((*pBuf) & 0xC0)!=0x80 ) - return -1; // invalid code - - iCode = ( iCode<<6 ) + ( (*pBuf) & 0x3F ); - iBytes--; - pBuf++; - } while ( iBytes ); - - // all good - return iCode; -} - struct CurrentWord_t { diff -r e6b939051504 src/sphinxutils.cpp --- a/src/sphinxutils.cpp Tue Apr 01 12:51:18 2008 +0800 +++ b/src/sphinxutils.cpp Mon Apr 07 11:02:46 2008 +0800 @@ -127,6 +127,7 @@ static KeyDesc_t g_dKeysIndex[] = { "min_word_len", 0, NULL }, { "charset_type", 0, NULL }, { "charset_table", 0, NULL }, + { "charset_dictpath", 0, NULL }, { "ignore_chars", 0, NULL }, { "min_prefix_len", 0, NULL }, { "min_infix_len", 0, NULL }, @@ -681,7 +682,23 @@ ISphTokenizer * sphConfTokenizer ( const ? sphCreateUTF8NgramTokenizer () : sphCreateUTF8Tokenizer (); - } else + } +#if USE_MMSEG + else if (hIndex("charset_dictpath") && hIndex["charset_type"]=="zh_cn.utf-8" ) + { + //LMN: TODO append GBKTokenizer here. + pTokenizer = sphCreateUTF8ChineseTokenizer(hIndex["charset_dictpath"].cstr()); + + } +#if USE_LIBICONV + else if ( hIndex("charset_dictpath") &&hIndex["charset_type"]=="zh_cn.gbk") + { + pTokenizer = sphCreateGBKChineseTokenizer(hIndex["charset_dictpath"].cstr()); + } +#endif + +#endif + else { sError.SetSprintf ( "unknown charset type '%s'", hIndex["charset_type"].cstr() ); return NULL; diff -r e6b939051504 src/tokenizer_zhcn.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/tokenizer_zhcn.cpp Mon Apr 07 11:02:46 2008 +0800 @@ -0,0 +1,293 @@ +#include "SegmenterManager.h" +#include "Segmenter.h" + +#include "sphinx.h" +#include "tokenizer_zhcn.h" +#if USE_LIBICONV +#include +#endif + +#if USE_WINDOWS +#define ICONV_INBUF_CONST 1 +#endif + +//////////////////////////////////////////////////////////// +typedef CSR_Singleton SegmenterManagerSingleInstance; + +class CSphTokenizer_zh_CN_UTF8_Private +{ +public: + CSphTokenizer_zh_CN_UTF8_Private() + :m_seg(NULL), m_mgr(NULL) { +#if USE_LIBICONV + m_iconv = NULL; +#endif + if(!m_lower) + m_lower = css::ToLower::Get(); + if(!m_tagger) + m_tagger = css::ChineseCharTagger::Get(); + } + + ~CSphTokenizer_zh_CN_UTF8_Private() { +#if USE_LIBICONV + if(m_iconv) + iconv_close(m_iconv); + m_iconv = NULL; +#endif + }; + + css::Segmenter* GetSegmenter(const char* dict_path) { + int nRet = 0; + if(!m_mgr) { + m_mgr = SegmenterManagerSingleInstance::Get(); + if(dict_path) + nRet = m_mgr->init(dict_path); + } + if(nRet == 0 && !m_seg) + m_seg = m_mgr->getSegmenter(); + return m_seg; + } +#if USE_LIBICONV + iconv_t GetConverter(const char* from, const char* to) { + if(m_iconv) + return m_iconv; + //m_iconv = iconv_open ("UTF-8//IGNORE", "GB18030"); + m_iconv = iconv_open (to, from); + if (m_iconv == (iconv_t) -1) //error check. + return (iconv_t)(-1); + iconv(m_iconv, NULL, NULL, NULL, NULL); +#if 0 + //ignore invalid char-seq + int one = 1; + iconvctl(m_iconv, ICONV_SET_DISCARD_ILSEQ, &one); +#endif + return m_iconv; + } +#endif + +public: + static css::ToLowerImpl* m_lower; + static css::ChineseCharTaggerImpl* m_tagger; +protected: + css::Segmenter* m_seg; + css::SegmenterManager* m_mgr; +#if USE_LIBICONV + iconv_t m_iconv; +#endif +}; + +css::ToLowerImpl* CSphTokenizer_zh_CN_UTF8_Private::m_lower = NULL; +css::ChineseCharTaggerImpl* CSphTokenizer_zh_CN_UTF8_Private::m_tagger = NULL; + +//////////////////////////////////////////////////////////// +CSphTokenizer_zh_CN_UTF8::CSphTokenizer_zh_CN_UTF8 () +{ + m_dictpath = NULL; + d_ = new CSphTokenizer_zh_CN_UTF8_Private(); + m_tLC.Reset(); +} + +CSphTokenizer_zh_CN_UTF8::~CSphTokenizer_zh_CN_UTF8 () +{ + delete d_; +} + +void CSphTokenizer_zh_CN_UTF8::SetBuffer ( BYTE * sBuffer, int iLength ) +{ + // check that old one is over and that new length is sane + assert ( iLength>=0 ); + + // set buffer + m_pBuffer = sBuffer; + m_pBufferMax = sBuffer + iLength; + m_pCur = sBuffer; + + // fixup embedded zeroes with spaces + for ( BYTE * p = m_pBuffer; p < m_pBufferMax; p++ ) + if ( !*p ) + *p = ' '; + + m_iOvershortCount = 0; + //real set buffer. + css::Segmenter* seg = d_->GetSegmenter(m_dictpath); + seg->setBuffer((u1*)m_pBuffer,m_pBufferMax - m_pBuffer); + //be ready for getToekn + FlushAccum (); +} + +void CSphTokenizer_zh_CN_UTF8::FlushAccum () +{ + assert ( m_pAccum-m_sAccum < (int)sizeof(m_sAccum) ); + m_iLastTokenLen = m_iAccum; + *m_pAccum = 0; + m_iAccum = 0; + m_pAccum = m_sAccum; +} + +BYTE * CSphTokenizer_zh_CN_UTF8::GetToken () +{ + m_bWasSpecial = false; + css::Segmenter* seg = d_->GetSegmenter(m_dictpath); //TODO fill blank here + //seg->setBuffer((u1*)m_pBuffer,m_pBufferMax - m_pBuffer); + u2 len = 0, symlen = 0; + const char* tok = NULL; + + do { + tok = (const char*)seg->peekToken(len, symlen); + seg->popToken(len); + }while(tok && tok<(const char*)m_pBufferMax &&tok<(const char*)m_pCur); + + if(len > 3*SPH_MAX_WORD_LEN) + len = 3*SPH_MAX_WORD_LEN; //this might cause a wrong token. no token can larger than 64-char + //FIXME: NO Memcpy, 1. ToLower 2. Exceptions[done in libmmseg] 3. eat blanks + //to Lower + BYTE* tok_ptr = (BYTE*)tok; + BYTE* tok_max = tok_ptr + symlen; + while ( tok_ptr=0 ){ + iCode = (int)d_->m_lower->toLower((u2)iCode); + int iFolded = m_tLC.ToLower ( iCode ); + if ( ( iFolded & FLAG_CODEPOINT_SPECIAL ) && m_iAccum==0 ) + { + m_bWasSpecial = true; + + AccumCodepoint ( iFolded & MASK_CODEPOINT ); + *m_pAccum = '\0'; + + //m_pTokenStart = tok_ptr; + //m_pTokenEnd = m_pCur; + m_pCur = tok_ptr; + m_iLastTokenLen = 1; + FlushAccum (); + + return m_sAccum; + } + //check is sep, no sep can be passed to user + //if(IsSeparator (iFolded, m_iAccum==0)) + u2 tag = d_->m_tagger->tagUnicode(iCode,1); + tag = (tag&0x3F) + 'a' -1; + if(tag == 'w' || iCode == ' ') + { + //skip the 1st char + } + else + { + if ( m_iAccum == 0 ) + m_pTokenStart = (BYTE*)tok; + + AccumCodepoint ( iCode ); + } + //AccumCodepoint ( iCode ); //will cut token is larger than SPH_MAX_WORD_LEN + }else + break; + } + //?? m_pCur = tok_ptr; + FlushAccum (); + + //memcpy(m_sAccum, tok, len); + //m_sAccum[symlen] = 0; + //printf("%*.*s/x ",symlen,symlen,tok); + if(m_sAccum[0] == '\r' || m_sAccum[0] == '\n') { + //m_sAccum[0] = 0; + return GetToken(); + } + //printf("%s,",m_sAccum); + if(symlen) + return m_sAccum; + else{ + //printf("\n"); + return NULL; + } +} + + +ISphTokenizer * CSphTokenizer_zh_CN_UTF8::Clone ( bool bEscaped ) const +{ + CSphTokenizer_zh_CN_UTF8 * pClone = new CSphTokenizer_zh_CN_UTF8 (); + pClone->CloneBase ( this, bEscaped ); + pClone->m_dictpath = m_dictpath; + return pClone; +} + + +int CSphTokenizer_zh_CN_UTF8::GetCodepointLength ( int iCode ) const +{ + if ( iCode<128 ) + return 1; + + int iBytes = 0; + while ( iCode & 0x80 ) + { + iBytes++; + iCode <<= 1; + } + + assert ( iBytes>=2 && iBytes<=4 ); + return iBytes; +} + +#if USE_LIBICONV + +CSphTokenizer_zh_CN_GBK::CSphTokenizer_zh_CN_GBK () + :m_convert_buffer(NULL) +{ + m_convert_buffer = m_default_convert_buffer; + m_buffer_size = GBK_CONVERT_BUFFER_SIZE; +} + +CSphTokenizer_zh_CN_GBK::~CSphTokenizer_zh_CN_GBK() +{ + if(m_convert_buffer != m_default_convert_buffer) + free(m_convert_buffer); + m_convert_buffer = NULL; +} + +void CSphTokenizer_zh_CN_GBK::SetBuffer( BYTE * sBuffer, int iLength ) +{ + //check convert buffer + int tLength = (int)(iLength*1.5+3); + if(tLength > m_buffer_size){ + if(m_convert_buffer != m_default_convert_buffer) + free(m_convert_buffer); + m_convert_buffer = (BYTE*)malloc(tLength); + m_buffer_size = tLength; + } + //convert + iconv_t it = d_->GetConverter("GB18030", "UTF-8//IGNORE"); + +#if ICONV_INBUF_CONST + const char * ptr = (char*)sBuffer; +#else + char * ptr = (char*)sBuffer; +#endif + + char * target_ptr = (char*)m_convert_buffer; + size_t inbytes_remaining = (size_t)iLength; + size_t outbytes_remaining = (size_t)m_buffer_size; + int err = iconv (it, &ptr, &inbytes_remaining, &target_ptr, &outbytes_remaining); + if(err>=0) + *target_ptr = 0; //end the char + //call base::setbuffer + CSphTokenizer_zh_CN_UTF8::SetBuffer(m_convert_buffer, m_buffer_size - outbytes_remaining); +} + +/* +BYTE * CSphTokenizer_zh_CN_GBK::GetToken () +{ + return NULL; +} +*/ + + +ISphTokenizer * CSphTokenizer_zh_CN_GBK::Clone ( bool bEscaped ) const +{ + CSphTokenizer_zh_CN_GBK * pClone = new CSphTokenizer_zh_CN_GBK (); + pClone->CloneBase ( this, bEscaped ); + pClone->m_dictpath = m_dictpath; + return pClone; +} + +#endif + diff -r e6b939051504 src/tokenizer_zhcn.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/tokenizer_zhcn.h Mon Apr 07 11:02:46 2008 +0800 @@ -0,0 +1,57 @@ +#ifndef _TOKENIZER_ZHCN_H_ +#define _TOKENIZER_ZHCN_H_ + +/* +#ifndef BYTE +#define BYTE char +#define SPH_MAX_WORD_LEN 64 +#endif +*/ + +class CSphTokenizer_zh_CN_UTF8_Private; + +class CSphTokenizer_zh_CN_UTF8: public CSphTokenizerTraits +{ +public: + CSphTokenizer_zh_CN_UTF8 (); + ~CSphTokenizer_zh_CN_UTF8(); + + virtual void SetBuffer ( BYTE * sBuffer, int iLength ); + virtual BYTE * GetToken (); + virtual ISphTokenizer * Clone ( bool bEscaped ) const; + virtual bool IsUtf8 () const { return true; } + virtual int GetCodepointLength ( int iCode ) const; + void setDictPath(const char* path) { m_dictpath = path; } +//protected: + //BYTE * m_pBuffer; ///< my buffer + //BYTE * m_pBufferMax; ///< max buffer ptr, exclusive (ie. this ptr is invalid, but every ptr below is ok) + //BYTE * m_pCur; ///< current position + //int m_iOvershortCount; ///< skipped overshort tokens count + //BYTE m_sAccum [ 3*SPH_MAX_WORD_LEN+3 ]; +protected: + CSphTokenizer_zh_CN_UTF8_Private* d_; + const char* m_dictpath; + void FlushAccum (); +}; + +#if USE_LIBICONV + +#define GBK_CONVERT_BUFFER_SIZE 512*1024*3 +class CSphTokenizer_zh_CN_GBK : public CSphTokenizer_zh_CN_UTF8 +{ +public: + CSphTokenizer_zh_CN_GBK (); + ~CSphTokenizer_zh_CN_GBK(); + virtual void SetBuffer ( BYTE * sBuffer, int iLength ); + //virtual BYTE * GetToken (); + virtual ISphTokenizer * Clone ( bool bEscaped ) const; +protected: + BYTE m_default_convert_buffer[GBK_CONVERT_BUFFER_SIZE]; + BYTE* m_convert_buffer; + int m_buffer_size; +}; + +#endif + +#endif + diff -r e6b939051504 win/indexer05.vcproj --- a/win/indexer05.vcproj Tue Apr 01 12:51:18 2008 +0800 +++ b/win/indexer05.vcproj Mon Apr 07 11:02:46 2008 +0800 @@ -1,209 +1,209 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r e6b939051504 win/libsphinx05.vcproj --- a/win/libsphinx05.vcproj Tue Apr 01 12:51:18 2008 +0800 +++ b/win/libsphinx05.vcproj Mon Apr 07 11:02:46 2008 +0800 @@ -1,268 +1,272 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r e6b939051504 win/search05.vcproj --- a/win/search05.vcproj Tue Apr 01 12:51:18 2008 +0800 +++ b/win/search05.vcproj Mon Apr 07 11:02:46 2008 +0800 @@ -1,207 +1,207 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r e6b939051504 win/searchd05.vcproj --- a/win/searchd05.vcproj Tue Apr 01 12:51:18 2008 +0800 +++ b/win/searchd05.vcproj Mon Apr 07 11:02:46 2008 +0800 @@ -1,209 +1,209 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r e6b939051504 win/spelldump05.vcproj --- a/win/spelldump05.vcproj Tue Apr 01 12:51:18 2008 +0800 +++ b/win/spelldump05.vcproj Mon Apr 07 11:02:46 2008 +0800 @@ -1,207 +1,207 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r e6b939051504 win/tests05.vcproj --- a/win/tests05.vcproj Tue Apr 01 12:51:18 2008 +0800 +++ b/win/tests05.vcproj Mon Apr 07 11:02:46 2008 +0800 @@ -1,207 +1,207 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +