/*--------------------------------------------------------------*
  sang  ---  a program that collects N-gram statistics using a Suffix Array

  USAGE   sang -n NUM -t NUM FILENAME
  OPTION
  -n NUM : NUM specifies the n of the n-gram.
  -t NUM : threshold: n-grams whose frequency is NUM or less are not shown.

  N-grams never contain a newline.

[Example run]
> cat test
ABCBACABBAACABCABCACABACABBACBACACAAABACCAB
> makeary -q test                  build the array file
> sang -n 6 -t 1 test              show 6-grams whose frequency is greater than 1
2 ACABBA
2 BACABB
> sang -n 3 -t 4 test              show trigrams whose frequency is greater than 4
6 ACA
5 BAC
6 CAB

  971029  Version 0.1  YAMASITA Tatuo (tatuo-y@is.aist-nara.ac.jp)
  980327  Version 0.2  updated for the NEW sa_sel(); BUGFIX: the result for the last N-gram was not displayed
 *--------------------------------------------------------------*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "sufary.h"

/* Forward declarations for the functions defined later in this file. */
static void usage(void);
void do_sang(char *fname, int ng, int threshold);
void hyouji(char *nstr, int n_ctr, int threshold);
/* hyouji() calls mojibakebousi() before its definition appears; without
   this prototype the call would be an implicit function declaration,
   which is not valid C since C99. */
int mojibakebousi(char *buf, int haba);

char nstr[1000]; /* buffer holding the n-gram currently being counted */

/**********************************************
 * int main(int argc, char *argv[])
 *
 * purpose
 *   Walk the command line from left to right. "-n NUM" sets the
 *   n-gram length and "-t NUM" sets the threshold; any argument that
 *   does not start with '-' is taken as a file name and handed to
 *   do_sang() at once, using the option values seen so far.
 *
 * return value
 *   0 when every argument was processed. The program exits with
 *   status 1 (after printing the usage) when no argument is given, an
 *   unknown option is found, an option has no value, or a file name
 *   appears before a positive -n value has been set.
 **********************************************/
int main(int argc, char *argv[])
{
  int ng = 0;        /* n of the n-gram; stays 0 until -n is given */
  int threshold = 0; /* threshold forwarded to do_sang() */

  if (argc <= 1) {
    usage();
    exit(1);
  }

  while (argc > 1) {
    if (argv[1][0] == '-') {
      char opt = argv[1][1];

      /* -n and -t both consume the next argument; refuse a trailing
         option instead of passing a NULL argv[2] to atoi(). */
      if ((opt == 'n' || opt == 't') && argc <= 2) {
        usage();
        exit(1);
      }

      switch (opt) {
      case 'n': /* n of the n-gram */
        ng = atoi(argv[2]);
        argc--; argv++;
        break;
      case 't': /* threshold */
        threshold = atoi(argv[2]);
        argc--; argv++;
        break;
      default: /* unknown option */
        usage();
        exit(1);
      }
    } else {
      /* A file name: -n must already hold a usable value, otherwise
         do_sang() would be called with a meaningless length. */
      if (ng < 1) {
        fprintf(stderr, "sang: -n NUM must be given as a positive integer before FILENAME\n");
        usage();
        exit(1);
      }
      do_sang(argv[1], ng, threshold);
    }
    /* step to the next argument */
    argc--; argv++;
  }

  return 0;
}

/**********************************************
 * void do_sang(char *fname, int ng, int threshold)
 *
 * purpose
 *   Count the n-grams of one text and print them through hyouji().
 *   Every index from sa_bottom(ary) to sa_top(ary) is visited in
 *   order; the first ng bytes at each position are compared with the
 *   n-gram held in the global buffer nstr. A match bumps the counter,
 *   a mismatch reports the finished n-gram and starts a new one.
 *   This relies on equal n-grams being adjacent in the array
 *   (presumably because the suffix array is sorted -- confirm against
 *   the sufary library).
 *
 * parameters
 *   fname : name passed to sa_openfiles()
 *   ng : n of the n-gram; must satisfy 1 <= ng < sizeof nstr
 *   threshold : forwarded unchanged to hyouji()
 *
 * return value
 *   none. Exits with status 1 when ng is out of range or the files
 *   cannot be opened.
 **********************************************/
void do_sang(char *fname, int ng, int threshold)
{
  long tmp;
  int n_ctr = 0; /* occurrences of nstr seen after the first one */
  SUFARY *ary;
  char *p;

  /* nstr must hold ng bytes plus a terminator; a negative ng would
     also turn into a huge size_t in strncmp()/strncpy(). */
  if (ng < 1 || (size_t)ng >= sizeof nstr) {
    fprintf(stderr, "sang: n must be between 1 and %d\n", (int)sizeof nstr - 1);
    usage();
    exit(1);
  }

  /* open the text / array files */
  if ((ary = sa_openfiles(fname,NULL)) == NULL){
    printf("argument ignored.\n");
    usage();
    exit(1);
  }

  nstr[0] = '\0'; /* empty buffer marks "no n-gram stored yet" */

  /* loop over every element of the array */
  for (tmp = sa_bottom(ary); tmp <= sa_top(ary); tmp++){
    /* pointer into the text for this array entry */
    p = sa_aryidx2txtptr(ary, tmp);

    /* same n-gram as the stored one: just count it */
    if (strncmp(p, nstr, (size_t)ng) == 0){
      n_ctr++;
      continue;
    }

    /* a different n-gram begins: report the previous one unless the
       buffer is still empty or the n-gram contains a newline */
    if (*nstr != '\0' && strchr(nstr, '\n') == NULL)
      hyouji(nstr, n_ctr, threshold);

    /* store the new n-gram; strncpy() does not terminate the copy
       when the source has ng or more bytes, so terminate explicitly
       (bytes from an earlier call with a larger ng may follow) */
    strncpy(nstr, p, (size_t)ng);
    nstr[ng] = '\0';
    n_ctr = 0;
  }

  /* report the last n-gram under the same conditions as in the loop,
     so nothing is printed for an empty array or a newline n-gram */
  if (*nstr != '\0' && strchr(nstr, '\n') == NULL)
    hyouji(nstr, n_ctr, threshold);

  sa_closefiles(ary);
}


/**********************************************
 * void hyouji(char *nstr, int n_ctr, int threshold)
 *
 * purpose
 *   Print one n-gram together with its frequency.
 *
 * parameters
 *   nstr : pointer to the n-gram string; it is rewritten in place by
 *          mojibakebousi() before being printed
 *   n_ctr : frequency minus one (the printed value is n_ctr + 1)
 *   threshold : nothing is printed when n_ctr is below this value
 *
 * return value
 *   none
 **********************************************/
void hyouji(char *nstr, int n_ctr, int threshold)
{
  /* below the threshold: stay silent */
  if (n_ctr < threshold)
    return;

  /* clean up the string so that it is safe to display, then print it */
  mojibakebousi(nstr, 0);
  printf("%d %s\n", n_ctr + 1, nstr);
}


/**************
   Print the version line and the usage summary on stderr.
   **************/
void usage(){
  /* adjacent literals are joined by the compiler into one string */
  fputs("Version 0.2  970327  YAMASITA Tatuo (tatuo-y@cl.aist-nara.ac.jp)\n"
        "USAGE   sang -n NUM -t NUM FILENAME\n"
        "OPTION\n"
        "  -n NUM : N for N-gram\n"
        "  -t NUM : threshold\n",
        stderr);
}


/* from show.c(for 'array') */
/**********************************************
 * int mojibakebousi(char *buf, int haba);
 *
 * purpose
 *    Make a string safe to display: control characters are replaced
 *    and a byte that looks like half of a split multi-byte character
 *    at either end is overwritten with '%'.
 *    NOTE(review): the parity tests presumably assume a two-byte
 *    encoding such as EUC-JP -- confirm.
 * parameters
 *    buf:  NUL-terminated string, modified in place
 *    haba: bytes at an index below haba are counted as "before",
 *          all other bytes as "after"
 * return value
 *    number of bytes below 0x80 found at an index below haba
 *
 * description
 *    Bytes below 0x80 are counted separately for the "before" and
 *    "after" regions; those below 0x20 become '!'. An odd "before"
 *    count overwrites buf[0]; an "after" count whose parity differs
 *    from the parity of the string length overwrites the last byte.
 **********************************************/
int mojibakebousi(char *buf,int haba){
  /* The loop only replaces non-NUL bytes with '!', so the length
     cannot change and is computed once. */
  size_t len = strlen(buf);
  size_t i;
  int pre_hankaku = 0;  /* single-byte characters before haba */
  int post_hankaku = 0; /* single-byte characters from haba onward */

  for (i = 0; i < len; i++) {
    unsigned char c = (unsigned char)buf[i];

    if (c >= 0x80)
      continue; /* part of a multi-byte character: not counted */

    /* a non-positive haba puts every byte in the "after" region */
    if (haba > 0 && i < (size_t)haba)
      pre_hankaku++;
    else
      post_hankaku++;

    if (c < 0x20)
      buf[i] = '!'; /* control character -> '!' */
  }

  /* odd "before" count: the first byte is treated as a broken half */
  if (pre_hankaku % 2 == 1)
    buf[0] = '%';

  /* "after" count and length disagree in parity: the last byte is
     treated as a broken half (never true for an empty string) */
  if (len > 0 && (post_hankaku % 2) != (int)(len % 2))
    buf[len - 1] = '%';

  return pre_hankaku;
}
