/* 
   Program to read in the text thesaurus and 
   sorted wordlist and then build a binary 
   data file and text index file which 
   stores the thesaurus and allow fast
   access to each entry.

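   The syntax for invoking this command is:
   thencode path_to_input_directory path_to_output_directory base_name

   where base_name selects the input files <base_name>_thes.txt and
   <base_name>_words.txt in the input directory and names the output
   files <base_name>.dat and <base_name>.idx in the output directory.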


   The input:

   The format of each line of the input text 
   thesaurus file is simply:

   <word>,<syn1>,<syn2>,...<synn>\n

   The text wordlist is simply a sorted version
   of each of the entry words in the thesaurus,
   one per line, each terminated with \n


   The output:

   The packed binary data file format is:

      <ns><s1><s2><s3>...<sn>

   where ns is the number of synonyms (sal_u16)
   and s1,...,sn are the array indexes (sal_u16) 
   into the index file.  All data are stored in
   Big Endian format.  Endian conversion is done
   when necessary.  Note, because sal_uInt16 is
   used, the maximum number of word entries  in
   the thesaurus is limited to 65535.

   The index file is a sorted plain text file ready 
   for loading.  Each line in the file is
   as follows:
   
   <word>,<offset>\n

    where <word> is text (8 bit) string, and <offset>
    is the offset into the binary data file where
    the synonyms for this word may be found 
*/


#include <sal/types.h>
#include <osl/file.h>
#include <osl/file.hxx>
#include <osl/thread.h>
#include <osl/process.h>
#include <rtl/alloc.h>
#include <rtl/string.hxx>
#include <rtl/strbuf.hxx>
#include <rtl/ustring.hxx>
#include <tools/urlobj.hxx>

#if defined(WIN) || defined(WNT)
#undef __LITTLEENDIAN
#define __LITTLEENDIAN 1
#else 
#include <tools/svconf.h>
#endif

#include <stdio.h>
#include <string.h>


using namespace rtl;
using namespace osl;

#include "thencode.hxx"

#define READERBUFSZ 8192

/*
  Small read-ahead layer over a File.

  Data is fetched from the File READERBUFSZ bytes at a time and handed
  out one character (readCharacter) or one line (readLine) at a time.
  The reader allocates and frees its own buffer; the File passed to the
  constructor is only used for reading and is never closed or deleted
  by this class.
*/
class BufferedReader
{
    public:
        BufferedReader(File * pf);
        ~BufferedReader();
        /* returns 1 and stores a character in *pc, 0 at end of file, -1 on read error */
        sal_Int32 readCharacter(sal_Char* pc);
        /* reads one '\n' terminated line into pBuffer (capacity nc bytes) */
        sal_Int32 readLine(sal_Char* pBuffer, sal_uInt64 nc);

    private:
        /* Declared but not implemented: a copy would share m_pBuffer and
           both destructors would delete[] it. */
        BufferedReader(const BufferedReader&);
        BufferedReader& operator=(const BufferedReader&);

        File* m_pFile;          /* source of the data, not owned */
        sal_Char* m_pBuffer;    /* start of the READERBUFSZ byte buffer */
        sal_Char* m_pEnd;       /* one past the last valid byte in the buffer */
        sal_Char* m_pCur;       /* next byte to hand out */
};

/* Attaches the reader to pf and allocates an (initially empty) read buffer. */
BufferedReader::BufferedReader(File *pf)
    : m_pFile(pf),
      m_pBuffer(new sal_Char[READERBUFSZ])
{
    /* cursor == end marks the buffer as empty, so the first
       readCharacter call triggers a read from the file */
    m_pEnd = m_pBuffer;
    m_pCur = m_pBuffer;
}

/* Releases the read buffer; the File given to the constructor is left untouched. */
BufferedReader::~BufferedReader()
{
    sal_Char* pDoomed = m_pBuffer;
    m_pBuffer = 0;
    delete[] pDoomed;
}

/*
  Hands out the next character of the file, refilling the internal
  buffer from the File whenever it has been used up.

  returns:   1 - a character was stored in *pc
             0 - the file returned no more data (*pc untouched)
            -1 - the underlying read reported an error (*pc untouched)
*/
sal_Int32 BufferedReader::readCharacter(sal_Char *pc)
{
    const bool bBufferUsedUp = ( m_pCur == m_pEnd );

    if ( bBufferUsedUp ) {
        sal_uInt64 nFilled;
        const FileBase::RC eStatus = m_pFile->read(m_pBuffer, READERBUFSZ, nFilled);

        if ( eStatus != FileBase::E_None )
            return -1;
        if ( nFilled == 0 )
            return 0;

        /* restart at the front of the freshly filled buffer */
        m_pCur = m_pBuffer;
        m_pEnd = m_pBuffer + nFilled;
    }

    *pc = *m_pCur;
    ++m_pCur;
    return 1;
}

/* 
  read a line of text from a text file, stripping
  off the '\n' line terminator and always leaving a
  null terminated string in pBuffer.

  pBuffer:  destination buffer
  nc:       capacity of pBuffer in bytes; at most nc - 1
            characters are stored, followed by the terminator

  returns:  -1 on read error, or when pBuffer is null or nc is
               too small (< 2) to hold any character
             0 at end of file with nothing read
             otherwise the number of characters consumed from
             the file for this line, INCLUDING the '\n' if one
             was found.  A blank line therefore returns 1 (with
             an empty string in pBuffer), so callers looping on
             "result > 0" stop only at end of file.

  A line longer than nc - 1 characters is returned in pieces:
  the first call yields the first nc - 1 characters and the
  following call(s) continue where it stopped.
*/
sal_Int32 BufferedReader::readLine(sal_Char *pBuffer, sal_uInt64 nc)
{
    if ( pBuffer == 0 || nc == 0 )
        return -1;
    *pBuffer = '\0';
    if ( nc < 2 )
        return -1;          /* room for the terminator only */

    const sal_uInt64 nMax = nc - 1;     /* characters that fit besides the terminator */
    sal_uInt64 nb = 0;

    while ( nb < nMax ) {
        sal_Char c;
        const sal_Int32 rc = this->readCharacter(&c);
        if ( rc == -1 ) {
            pBuffer[nb] = '\0';
            return -1;
        }
        if ( rc == 0 )
            break;                      /* end of file: return what we have */
        if ( c == '\n' ) {
            /* terminator found: drop it from the string but count it */
            pBuffer[nb] = '\0';
            return (sal_Int32)(nb + 1);
        }
        pBuffer[nb++] = c;
    }

    /* end of file or buffer full */
    pBuffer[nb] = '\0';
    return (sal_Int32) nb;
}


int 
main(sal_Int32 argc, sal_Char** argv)
{

  sal_Int32  j, m;
  sal_Int32  nw;                  /* number of entries in thesaurus */
  sal_Char**  list;               /* stores word list */
  sal_uInt32* offst;              /* stores offset list */
  sal_uInt64 offset;              /* current offset into data file */
  sal_uInt16* ilst;               /* synonym index lst */
  sal_uInt16 ns;                  /* number of synonyms */
  sal_uInt64 bl;

  sal_Int32 nIndex;
  sal_Char cTok;
  sal_Int32 len=0;

  sal_Char *op, *en;
  sal_Int32 idx; 
  FileBase::RC rc;

  if ((!(argv[1])) || (!(argv[2])) || (!(argv[3]))) {
       fprintf(stderr,"thencode input_directory output_directory output_file\n");
       exit(1);
  }


  rtl_TextEncoding aEncoding = osl_getThreadTextEncoding();
  OUString aUniInp( argv[1], strlen( argv[1] ), aEncoding );
  OUString aUniOut( argv[2], strlen( argv[2] ), aEncoding );
  OUString aUniName(argv[3], strlen( argv[3] ), aEncoding );

  OUString iUrl;
  OUString aiUrl;
  File::getFileURLFromSystemPath( aUniInp, iUrl );

  OUString oUrl;
  OUString aoUrl;
  File::getFileURLFromSystemPath( aUniOut, oUrl );

  OUString aWorkingDirURL;

  osl_getProcessWorkingDir( &aWorkingDirURL.pData );
  File::getAbsoluteFileURL( aWorkingDirURL, iUrl, aiUrl );
  File::getAbsoluteFileURL( aWorkingDirURL, oUrl, aoUrl );

  /* read in the sorted word list and store way for bin search */
  OUString wpath;
  if( aiUrl.lastIndexOf( '/' ) != aiUrl.getLength()-1 )
      wpath = aiUrl + OUString( RTL_CONSTASCII_USTRINGPARAM("/" )) + aUniName
                    + OUString( RTL_CONSTASCII_USTRINGPARAM("_words.txt" )); 

  else
      wpath = aiUrl + aUniName + OUString( RTL_CONSTASCII_USTRINGPARAM("_words.txt"));
  File *pwdlst = new File(wpath);
  rc = pwdlst->open(OpenFlag_Read);
  if (rc != FileBase::E_None) {
     fprintf(stderr,"Error - error opening wordlist -  %d\n",rc);
     fprintf(stderr,"wordlist path is %s\n",OU2A(wpath));
     exit(1);
  } 

  sal_Char * wrd;
  wrd = (sal_Char *)rtl_allocateMemory(MAX_WD_LEN);

  // allocate the arrays 
  list = (sal_Char**) rtl_allocateMemory(MAX_WDS*sizeof(sal_Char*));
  offst = (sal_uInt32*) rtl_allocateMemory(MAX_WDS*sizeof(sal_uInt32));
  ilst = (sal_uInt16*) rtl_allocateMemory(MAX_WDS*sizeof(sal_uInt16));

  if ((!list) || (!(offst)) || (!(ilst))) {
    if (list)  rtl_freeMemory((void*)list);
    if (offst) rtl_freeMemory((void*)offst);
    if (ilst)  rtl_freeMemory((void*)ilst);
    fprintf(stderr,"Error - memory allocation error\n");
    exit(1);
  }   

  nw = 0;
  BufferedReader aPwdLst(pwdlst);
  len = aPwdLst.readLine(wrd,MAX_WD_LEN);

  while (len > 0)
  { 
    list[nw] = (sal_Char *)rtl_allocateMemory(len+1);
    rtl_copyMemory((list[nw]),wrd,len+1);
    nw++;
    len = aPwdLst.readLine(wrd,MAX_WD_LEN);
  }

  rtl_freeMemory((void *)wrd);
  pwdlst->close();
  delete(pwdlst);

  /* open the thesaurus final data file */
  OUString dpath;
  if( aoUrl.lastIndexOf( '/' ) != aoUrl.getLength()-1 )
      dpath = aoUrl + OUString (RTL_CONSTASCII_USTRINGPARAM("/")) + aUniName 
                    + OUString( RTL_CONSTASCII_USTRINGPARAM(".dat" )); 
  else
      dpath = aoUrl + aUniName + OUString( RTL_CONSTASCII_USTRINGPARAM(".dat"));

  File * pdfile = new File(dpath);
  rc = pdfile->open(OpenFlag_Read | OpenFlag_Write | OpenFlag_Create);
  if (rc == FileBase::E_EXIST) {
    rc =  pdfile->open(OpenFlag_Read | OpenFlag_Write);
  }
  if (rc != FileBase::E_None) {
     fprintf(stderr,"Error - error opening binary data file -  %d\n",rc);
     fprintf(stderr,"data file path is %s\n",OU2A(dpath));
     if (list)  rtl_freeMemory((void*)list);
     if (offst) rtl_freeMemory((void*)offst);
     if (ilst)  rtl_freeMemory((void*)ilst);
     exit(1);
  } 

  /* open the thesaurus input text file */
  OUString tpath;
  if( aiUrl.lastIndexOf( '/' ) != aiUrl.getLength()-1 )
      tpath = aiUrl + OUString( RTL_CONSTASCII_USTRINGPARAM("/")) + aUniName
                    +  OUString( RTL_CONSTASCII_USTRINGPARAM("_thes.txt" )); 
  else
      tpath = aiUrl + aUniName + OUString( RTL_CONSTASCII_USTRINGPARAM("_thes.txt"));
  File * ptfile = new File (tpath);
  rc = ptfile->open(OpenFlag_Read);
  if (rc != FileBase::E_None) {
     fprintf(stderr,"Error - error thesaurus text file  -  %d\n",rc);
     if (list)  rtl_freeMemory((void*)list);
     if (offst) rtl_freeMemory((void*)offst);
     if (ilst)  rtl_freeMemory((void*)ilst);
     exit(1);
  } 

  /* some io buffers */
  sal_Char * ibuf;
  ibuf = (sal_Char *)rtl_allocateMemory((sal_Int32)MAX_LN_LEN);
  sal_Char * obuf;
  obuf  = (sal_Char *)rtl_allocateMemory((sal_Int32)MAX_LN_LEN);


  /* now read each entry and map from words to indexes 
     into the word list and thereby create the binary
     thesaurus data file 
  */
  offset = 0;
  BufferedReader aTFile(ptfile);
  len = aTFile.readLine(ibuf,MAX_LN_LEN);
  while (len > 0) 
  {
    {
        OString tline = OString((sal_Char *)ibuf);

     /* parse the line */

        /* strip away the intial word */
        nIndex = 0;
        cTok = ',';
        OString entry = tline.getToken(0, cTok, nIndex);
        // fprintf(stdout,"...processing %s\n",entry.getStr());
        idx = binsearch((sal_Char *)entry.getStr(),list,nw);
        if (idx < 0) {
           fprintf(stderr,"Error - wordlist does not match data for %s\n",entry.getStr());
           if (list)  rtl_freeMemory((void*)list);
           if (offst) rtl_freeMemory((void*)offst);
           if (ilst)  rtl_freeMemory((void*)ilst);
           exit(1);
        }

        /* loop through the synonyms storing away indexes */
        offst[idx] = 0;
        ns = 0;
        do {
            OString nentry = tline.getToken(0, cTok, nIndex);
            m = binsearch((sal_Char *)nentry.getStr(),list,nw);
            if (m != -1) {
	       ilst[ns] = (sal_uInt16) m;
               ns++;
            } else {
               fprintf(stderr,"Error - can't find synonym %s in list\n",entry.getStr());
               fprintf(stderr,"Error - entry was  %s\n",nentry.getStr());
            }
        } while(nIndex >= 0);
 
     /* line now parsed, so store away encoded data */

     if (ns > 0) {
       bl = (size_t)((ns + 1) * sizeof(sal_uInt16));

       op = obuf;
       en = op;
       end_cpy(op,(sal_Char *) &ns,1);
       op = op + sizeof(sal_uInt16);
       end_cpy(op,(sal_Char *)ilst,ns);

       if (pdfile->setPos(Pos_Absolut,offset) != FileBase::E_None) {
	 fprintf(stderr,"Error - Seek Failed\n");
         if (list)  rtl_freeMemory((void*)list);
         if (offst) rtl_freeMemory((void*)offst);
         if (ilst)  rtl_freeMemory((void*)ilst);
         exit(1);
       }
       
       rc = writenc(pdfile, en , bl);
       if (rc !=  FileBase::E_None) {
	 fprintf(stderr,"Error - write returned an error %d\n",rc);
         if (list)  rtl_freeMemory((void*)list);
         if (offst) rtl_freeMemory((void*)offst);
         if (ilst)  rtl_freeMemory((void*)ilst);
         exit(1);
       }
       offst[idx] = (sal_Int32)offset;
       offset = offset + bl;
     }
    }
     len = aTFile.readLine(ibuf,MAX_LN_LEN);
   }
   /* finished writing data file */
   rtl_freeMemory((void *)ibuf);
   ptfile->close();
   delete(ptfile);
   pdfile->close();
   delete(pdfile);

   /* now must write out index file */
  OUString ipath;
  if( aoUrl.lastIndexOf( '/' ) != aoUrl.getLength()-1 )
      ipath = aoUrl + OUString( RTL_CONSTASCII_USTRINGPARAM("/" )) + aUniName + 
                      OUString( RTL_CONSTASCII_USTRINGPARAM(".idx" )); 
  else
      ipath = aoUrl + aUniName + OUString( RTL_CONSTASCII_USTRINGPARAM(".idx"));
   File* pifile = new File(ipath);
   rc = pifile->open(OpenFlag_Read | OpenFlag_Write | OpenFlag_Create);
   if (rc == FileBase::E_EXIST) {
     rc =  pifile->open(OpenFlag_Read | OpenFlag_Write);
   }
   if (rc != FileBase::E_None) {
      fprintf(stderr,"Error - error opening index file -  %d\n",rc);
      if (list)  rtl_freeMemory((void*)list);
      if (offst) rtl_freeMemory((void*)offst);
      if (ilst)  rtl_freeMemory((void*)ilst);
      exit(1);
   } 
   op = obuf;
   for (j=0; j<nw; j++) {
      bl = sprintf(op,"%s,%d\n",list[j],offst[j]);
      rc = writenc(pifile, obuf, bl);
      if (rc !=  FileBase::E_None) {
	fprintf(stderr,"Error - write returned an error %d\n",rc);
        if (list)  rtl_freeMemory((void*)list);
        if (offst) rtl_freeMemory((void*)offst);
        if (ilst)  rtl_freeMemory((void*)ilst);
        exit(1);
      }
   }
   rtl_freeMemory((void *)obuf);
   for (j = 0; j<nw; j++) {
     rtl_freeMemory(list[j]);
   }
   if (list)  rtl_freeMemory((void*)list);
   if (offst) rtl_freeMemory((void*)offst);
   if (ilst)  rtl_freeMemory((void*)ilst);
   pifile->close();
   delete(pifile);
   
   return 0;
}


/*
  writes a fixed number of bytes to a File making sure
  the complete write occurs or an error is returned

  pf:   file to write to
  buf:  data to write
  nc:   number of bytes of buf to write (0 writes nothing)

  returns: FileBase::E_None once all nc bytes have been written,
           otherwise the error reported by File::write, or
           FileBase::E_IO when a write reports success without
           making any progress (which would otherwise loop forever)
*/
FileBase::RC writenc(File * pf, sal_Char * buf, sal_uInt64 nc)
{
  sal_uInt64 done = 0;

  /* a single write may be partial, so keep going from where it stopped */
  while (done < nc) {
     sal_uInt64 chunk = 0;
     const FileBase::RC rc = pf->write((void *) (buf+done), (nc - done), chunk);
     if (rc != FileBase::E_None) return rc;
     if (chunk == 0) return FileBase::E_IO;
     done = done + chunk;
  }
  return FileBase::E_None;
}


/* 
  performs a binary search on null terminated character
  strings

  sw:    word to look for
  list:  array of nlst strings sorted in rtl_str_compare order
  nlst:  number of entries in list

  returns: -1 on not found (also for an empty list or null arguments)
           index of wrd in the list[]
*/
sal_Int32 binsearch(sal_Char * sw, sal_Char* list[], int nlst) 
{
  /* nothing to search: avoids touching list[-1] when the list is empty */
  if (sw == 0 || list == 0 || nlst <= 0) return -1;

  sal_Int32 lo = 0;
  sal_Int32 hi = nlst - 1;

  while (lo <= hi) {
    /* midpoint computed without adding the two bounds together */
    const sal_Int32 mid = lo + ((hi - lo) >> 1);
    const sal_Int32 cmp = rtl_str_compare(sw,list[mid]);
    if (cmp > 0) {
       lo = mid + 1;
    } else if (cmp < 0) {
       hi = mid - 1;
    } else {
       return mid;
    }
  }
  return -1;
}


/*
   performs a special memcpy of a string of unsigned shorts
   which creates big endian versions of the unsigned short data

   q:        destination, receives 2 * num_u16 bytes
   p:        source, num_u16 values in the byte order of the host
   num_u16:  number of 16 bit values to convert (<= 0 copies nothing)

   Each value is read in host order and written most significant
   byte first, so the result is the same on little and big endian
   hosts without relying on a compile time endianness macro.  The
   conversion is also safe when q and p are the same buffer.
*/
void end_cpy(sal_Char * q, sal_Char * p, sal_Int32 num_u16)
{
  sal_Int32 i;
  for (i=0;i<num_u16;i++) {
    sal_uInt16 v;
    /* p is a byte pointer and may be unaligned, so copy the value out */
    memcpy(&v, p, sizeof(v));
    q[0] = (sal_Char)((v >> 8) & 0xFF);
    q[1] = (sal_Char)(v & 0xFF);
    p += sizeof(sal_uInt16);
    q += sizeof(sal_uInt16);
  }
}


