#!/usr/bin/regina
/*
 vim:ts=4:et:wrap:
 $Id: searchdeb,v 1.1 2000/07/05 01:01:44 rick Exp rick $
 * Rick Younie <younie@debian.org>
-+-
USAGE: searchdeb [-l list1 [list2]..] -p pattern

  -b  - download message bodies too
  -D  - save the list names to a local file for quicker startup
  -d  - go this far back; default 1 year if no argument
        [0-4]q  -or-  [0-..]y  -or-  all
  -e  - spelling errors - 0,1,2; default 0
  -H  - pull up lists.debian.org in the web browser
  -k  - keep the hits file on exit (always kept if Netscape is the browser)
  -l  - lists, default devel; all="devel mentors user"
  -L  pat - show all listnames matching pat; '*' for all
  -o  - lineonly; semicolon-separated words must be on same line
  -p  - search pattern; '?' for Glimpse help at lists.debian.org
  -t  - timeout in seconds for socket routine - default 60

  ^C breaks off the search and shows the hits to that point.

PATTERNS:
    "this,that"       - comma finds either term in the file
    "pgp;gpg;signing" - semicolon wants all terms in the file
    "some phrase"     - space finds terms adjacent, as if quoted

EXAMPLES:
   searchdeb -l devel policy -d 7y -p "burned out" -b
   searchdeb -l user devel -p "autoconf;automake"
   searchdeb -d 2q -p "libtool;rpath" -b
-*-
 * 
 */
	TRACE OFF
	SIGNAL ON HALT NAME CTRL_C
	SIGNAL OFF ERROR
	SIGNAL ON FAILURE
	SIGNAL ON NOVALUE
	SIGNAL ON SYNTAX

/* -------------------------------------------------------------------
 *	  constants and assigns
 */
	g.    = ''
	lf	  = '0a'x
	cr	  = '0d'x
	wipe  = '1b'x || '[K' || cr
	default_lists  = 'devel mentors user'
    globals = 'lf cr wipe'

	prefix = '/cgi-bin/searchlists?query='
	suffix = '&lists='
	site   = 'cgi.debian.org'
	default_browser = '/usr/bin/lynx'

/* -------------------------------------------------------------------
 */
MAIN:
	parse arg rgs

	/* ----------------------------------------------
	 *	environment variables
	 * ---------------------------------------------*/

	/* make name of config file for list names */
	homedir = value('HOME',,'SYSTEM')
	if homedir = '' then call EX 1,'..please set HOME env var'
	localdir = homedir'/.searchscripts'
	localnames = localdir'/deb_names'

	/* set browser */
	if value('SSBROWSER',,'SYSTEM') <> ''
		then browser = value('SSBROWSER',,'SYSTEM')
	else do
		if value('DISPLAY',,'SYSTEM') <> ''
			then browser = value('X11BROWSER',,'SYSTEM')
		else browser = value('CONSOLEBROWSER',,'SYSTEM')
	end
	if browser = '' then browser = default_browser
	'type >/dev/null 2>&1' browser	/* it exists after all that? */
	if RC <> 0 then call EX 1,'..you will have to install lynx',
		'or set *BROWSER - please see man page'
	if pos('netscape', browser) <> 0 then g.$keep = 1

	call value 'REGINA_MACROS', '/usr/lib/searchscripts','SYSTEM'

	/* ----------------------------------------------
	 *	parse arguments
	 * ---------------------------------------------*/

	/* print lists and exit if no args */
	if rgs = '-h' | rgs = '--help' then signal USAGE
	else if rgs = '-H' then do
		call DOBROWSER 'http://www.debian.org/Lists-Archives/'
		exit 0
	end
	/* allow missing -p if that's the only switch */
	else if pos(' -',rgs) = 0 then rgs = '-p' rgs

	opt. = 0
	listpatterns = ''
	rgs = ' 'rgs
	do while rgs <> ''
		parse var rgs ' -' option ' -' +0 rgs

		parse var option opt val
		select
			when opt = 'b' then opt.$bodiestoo = 1
			when opt = 'D' then downloadnames = 1
			when opt = 'd' then g.$datestring = val
			when opt = 'e' then g.$errors = val
			when opt = 'k' then g.$keep = 1
			when translate(opt) = 'L' then do
				if opt = 'L' then opt.$findmatch = 1
				if val = '*' | val = '?' then opt.$justprint = 1
				listpatterns = val
			end
			when opt = 'm' then g.$maxmsgs = val
			when opt = 'o' then g.$lineonly = 'on'
			when opt = 'p' then g.$pattern = val
			when opt = 'r' then g.$partial = val
			when opt = 't' then g.$timeout = val
			otherwise call EX 1,'..arg error "-'opt'"'
		end
	end
    posall = wordpos('all',listpatterns)
    if posall <> 0 then listpatterns = default_lists delword(listpatterns,posall,1)
	lists = listpatterns

	/* simple checks for bad input */
	if \datatype(g.$errors,'NUM') then g.$errors = 0
	if \datatype(g.$maxmsgs,'NUM') then g.$maxmsgs = 100
	if \datatype(g.$maxlines,'NUM') then g.$maxlines = 10 /* enable? */
	if \datatype(g.$timeout,'NUM') then g.$timeout = 60
	if g.$lineonly <> 'on' then g.$lineonly = 'off'
	if g.$partial <> 'on' then g.$partial = 'off'

	otheropts = '&errors='g.$errors		  ||,
				'&maxfiles='g.$maxmsgs	  ||,
				'&maxlines='g.$maxlines	  ||,
				'&lineonly='g.$lineonly	  ||,
				'&partial='g.$partial

	/* encode chars in the search pattern */
	transpat = ''
	do i = 1 to length(g.$pattern)
		char = substr(g.$pattern, i, 1)
		select
			when char >= 'a' & char <= 'z' then nop
			when char >= 'A' & char <= 'Z' then nop
			when char >= '0' & char <= '9' then nop
			otherwise char = '%'c2x(char)
		end
		transpat = transpat || char
	end i
	g.$pattern = transpat
	
	/* ----------------------------------------------
	 *	main
	 * ---------------------------------------------*/

	if value('downloadnames') = '1' then do
		if stream(localdir,'C','QUERY EXISTS') = '' then do
			'mkdir' localdir
			if RC <> 0 then call EX 1,'..could not create' localdir
		end

		'rm 2>/dev/null' localnames
		call GET_LISTNAMES 'lists.debian.org','/search.html',g.$timeout,,1
		words_per_line = 100
        all_lists = strip(all_lists,'T') /* subword leaves trailer */
		do i = 1 to words(all_lists) by words_per_line
			call lineout localnames,subword(all_lists,i,words_per_line)
		end
		say
		exit
	end

    /* pull listnames, print and exit "-L '*'" */
	if opt.$justprint then do
        call GET_LISTNAMES 'lists.debian.org','/search.html',g.$timeout
        'LIFO> sort >FIFO'
        do i = 1 to queued()
            parse pull n
            call charout , n'   '
        end
        say
        exit
    end

	/* load names from local file if exists else pull/scrape the page */
	if stream(localnames,'C','QUERY EXISTS') <> '' then do
		all_lists = ''
		do while lines(localnames) <> 0
			all_lists = all_lists linein(localnames)
		end
		all_lists = all_lists' '
		call stream localnames,'C','CLOSE'
	end
    else call GET_LISTNAMES 'lists.debian.org','/search.html',g.$timeout

	if opt.$findmatch then do
		call FIND_MATCH lists
		exit
	end

	if g.$pattern = '' then call EX 1,'..no pattern'

    /* "-p '?'" - it's been encoded */
	if g.$pattern = '%3F' then do
		call DOBROWSER 'http://lists.debian.org/glimpse.html#sect7'
		exit 0
	end

	call MAKETEMPFILES

	dates = MAKEDATESTRING(g.$datestring)

	if lists = '' then listnames = 'devel'	  /* default */
	else do
		listnames = ''
		do i = 1 to words(lists)
			list = word(lists, i)
			if pos(' 'list' ',all_lists) = 0
			    then call EX 1,'..no list "'list'"'
			listnames = listnames list
		end i
	end

	/* do for each list */
	got_hits = 0
	do i = 1 to words(listnames)

		list = word(listnames,i)
		call charout 'stdout', wipe || list': '
		call lineout indexfile, '<pre>'lf||lf'  <b>'list'</b>'lf

		request = prefix || g.$pattern || otheropts || suffix || list || dates

        /* try to salvage on error; check return at end of loop */
		ret = GETPAGE(site,request,g.$timeout,,1)

		/* split returned buffer into individual messages (each msg
			body is the few line match) tagged with date+msgnumber */
		call PARSEPAGE sockbuff

		/* sort msgnumbers.i (==date||l.d.o msgnumber) */
		call SORT

		if msgnumbers.0 = 0 then do
			if pos('<h2>No archives match', sockbuff) <> 0
				then errmsg = '  ..no such list'
				else errmsg = '  ..no hits'
			say cr || wipe || list':' errmsg
			call lineout indexfile, '    'errmsg
		end
		else do
			got_hits = 1
			say cr || wipe || list': 'msgnumbers.0 'hits'
			/* not -b, print the hits as we go */
			if \opt.$bodiestoo then do
				do j = 1 to msgnumbers.0
					msgnum = msgnumbers.j
					call lineout indexfile, body.msgnum
				end
			end
			/* if ^C or other error only show what we have so far */
			else if ret = 0 then pullret = PULL_BODIES()
		end

		/* leave if there was a ^C or socket error */
		if ret <> 0 | value('pullret') = 1 then leave
	end

/* if ^C in procedure, filenames might not be defined */
CTRL_C:
	if symbol('indexfile') <> 'LIT' then do
		call stream indexfile, 'C', 'CLOSE'
		if opt.$bodiestoo & symbol('bodyfile') <> 'LIT' then do
			call lineout bodyfile, lf'</body>'lf'</html>'
			call stream bodyfile, 'C', 'CLOSE'
			'cat 2>/dev/null' bodyfile '>>' indexfile
		end

		if got_hits then call DOBROWSER 'file://'indexfile
	end

    call CLEANUP

	exit 0


/* -------------------------------------------------------------------
 *  -b is chosen: SECTION index urls -> indexfile, bodies -> bodyfile
 *
 *	RETURNS
 *		indexfile, bodyfile
 */
PULL_BODIES: PROCEDURE EXPOSE (globals) g.$timeout msgnumbers. url.,
    indexfile bodyfile list

    SIGNAL ON HALT NAME RET1

	do i = 1 to msgnumbers.0 /* i also for inserting blank line in WRIT.. */
		msgnum = msgnumbers.i
		call charout ,cr || right(i,6) msgnumbers.0 || wipe

		lastsl = lastpos('/',url.msgnum) + 1
		parse var url.msgnum threadurl =(lastsl) .

		parse var url.msgnum site '/' +0 request
		ret = GETPAGE(site,request,g.$timeout)
        if ret <> 0 then leave i

		/* section urls -> indexfile; bodies -> a string var */
		call WRITE_INDEX_BODY sockbuff
	end i

	return 0
RET1:
    return 1

WRITE_INDEX_BODY:

	parse arg wholemessage

    /* newer pages have gone to lowercase tags */
    /* should really use the comment tags...investigate */
    if pos('<HR>',wholemessage) <> 0 then div = '<HR>'
    else div = '<hr>'
	parse var wholemessage  . (div) threadpara (div) head (div) body (div)

    parse var threadpara '>Date Index<' . '="' thread_tag '">'
	parse var head '>Subject<' ': ' subject '</'
	parse var head '>From<' ': ' from (lf)
	parse var head '>Date<' ': ' date '</'

    if pos('&',subject) <> 0 then subject = UNMARK(subject)

	/* clean up date for index display */
	cleandate = ''
	do j = 1 to words(date)
		w = word(date,j)
		if pos(',',w)<>0 then iterate
		if pos(':',w)<>0 then iterate
		if pos('-',w)<>0 then iterate
		if pos('+',w)<>0 then iterate
		cleandate = cleandate w
	end
	cleandate = space(cleandate,0)
	/* lose any (TZ) */
	if pos('(',cleandate)<>0 then parse var cleandate cleandate '('

	/* try to extract address for index display */
	cleanfrom = '??'
	if pos('">',from)<>0 then parse var from '">' cleanfrom '</'
	else if pos('mailto',from)<>0 then parse var from 'mailto:' cleanfrom '"'
	else if pos('"',from)<>0 then parse var from '"' cleanfrom '"'

	/* write SECTION url to its file */
	out = right('0'cleandate,9) '<a href="#SECTION'msgnum||,
		'">'left(subject,45)'</a>'
    if length(cleanfrom) <= 23 then out = out cleanfrom
    else out = out left(cleanfrom,23)
	call lineout indexfile, out
	if i // 10 = 0 then call lineout indexfile,'' 

	/* write body to its file */
	call lineout bodyfile, '<hr><hr>'lf'<pre><a name="SECTION'msgnum'"></a>'
	header= '  <em>Subject</em>:' subject||lf||,
			'  <em>From</em>   :' cleanfrom||lf||,
			'  <em>Date</em>   :' cleandate||lf||,
			'  <em>List</em>   :' list||lf||,
			'  <a href="http://'threadurl||thread_tag'"><strong>Thread</strong></a>'
	call lineout bodyfile, strip(header,'B',lf)||lf
	call lineout bodyfile, strip(body,'B',lf)||lf||lf

	return

/* FIXME: */
UNMARK: PROCEDURE EXPOSE (globals)

    parse arg sub
    sub = changestr('&amp;',sub,'&')
    sub = changestr('&lt;',sub,'<')
    sub = changestr('&gt;',sub,'>')
    return sub

/* -------------------------------------------------------------------
 * breaks the buffer up into separate posts and
 *	adds them to the msgnumbers. array
 * RETURNS:
 *	body.(msgnumber.i) - the bodies are the usual web page return of
 *		the several lines that contain a search term, prefixed with the
 *		line number.  Used if -b not selected
 *	msgnumbers.i - the array of message numbers, unique for each message
 *		0 < i < msgnumbers.0 (composed of the date and the l.d.o msg number)
 *	url.(msgnumber.i) - l.d.o urls used to pull the full msg, in case -b
 *
 */
PARSEPAGE: PROCEDURE EXPOSE (globals) body. msgnumbers. url.

	parse arg '<ul>' sockbuff '</ul>'

	i = 0
	do while sockbuff <> ''
		parse var sockbuff '<li>' msg '<li>' +0 sockbuff

		parse var msg '<a href="http://' url '">' title '<li>'

		/* assign to stem using date || msg# as the index
			so we can sort the messages by date, then number */
		parse var url junk_date '/msg' msgnumber '.html'
		date = right(junk_date,4)
		msgnum = date || msgnumber

		/* so year 20xx hits get sorted after 19xx (until 2095, at least) */
		if left(msgnum,2) < 95 then msgnum = '1'msgnum

		i = i + 1
		msgnumbers.i = msgnum
		body.msgnum = '<li>'changestr(lf||lf,msg,lf)
		url.msgnum = url
	end
	msgnumbers.0 = i

	return

/* -------------------------------------------------------------------
 * find list name matches
 */
FIND_MATCH: PROCEDURE EXPOSE (globals) all_lists

	parse arg pats

	do i = 1 to words(pats)
		pat = word(pats,i)
		call charout ,pat': '
		if pos('*',pat) <> 0 then do
			firstch = left(pat,1)
			lastch = right(pat,1)
			pat = strip(pat,'B','*')
			if firstch = '*' & lastch = '*' then nop
			else if firstch = '*' then pat = pat' '
			else pat = ' 'pat
		end
		mp = pos(pat,all_lists)
		if mp = 0 then  call charout ,'..no match'
		else do while mp <> 0
			hitst = lastpos(' ', all_lists, mp)
			hitend = pos(' ', all_lists, mp + 1)
			call charout ,strip(substr(all_lists,hitst,hitend-hitst),'B')'  '
			mp = pos(pat, all_lists, hitend)
		end
		say
	end i

	return

/* -------------------------------------------------------------------
 * RETURNS:
 *	lists. - groups are indented 2 spaces, listnames not
 *  all_lists - a string of all listnames
 *
 */
GET_LISTNAMES: PROCEDURE EXPOSE (globals) opt. all_lists

	parse arg site,url,timeout,,dots

	all_lists = ''

	ret = GETPAGE(site,url,timeout,,dots)

    if ret = 0 then do
        parse var sockbuff '"lists"' . '<option>' +0 listblock '</select>'
        do while listblock <> ''
            listblock = changestr(lf,listblock,' ')
            parse var listblock '<option>' listname '<option>' +0 listblock
            if listname = '' then leave
            all_lists = all_lists listname
            if opt.$justprint then push listname
        end
        all_lists = all_lists' '
    end

	return

/* -------------------------------------------------------------------
 * get a web page with the PULLPAGE external socket function
 * RETURN:  0 if no error
 *			sockbuff is the page, a global
 *
 */
GETPAGE: PROCEDURE EXPOSE (globals) sockbuff

	SIGNAL ON HALT NAME CTRL_3
	parse arg site,url,timeout,header,dots

	/* pull a page of hits */
	return = PULLPAGE(site, url, timeout, header, dots)
	parse var return errnum ',' sockbuff

	if errnum <> 0 then do
		call charout ,cr || wipe		 /* lose the dots */
		if errnum = -3
            then call EX 1,cr || wipe || '..no rxsock function library'
		else if errnum = -2 then say '  ..timeout'
		else if errnum > 0 then say '  ..socket error:' errnum
	end

	return errnum   /* -1 = ^C hit in the external func */
CTRL_3:
	return -1

/* -------------------------------------------------------------------
 */
MAKEDATESTRING: PROCEDURE EXPOSE (globals)

	parse arg pat

	if pat = '' then pat = '1y'
	else if pat = 'all' then pat = '120q' /* wayback; truncated later */

	lenpat = length(pat)
	parse var pat num =(lenpat) type
	select
		when type = 'y' then q_back = num * 4
		when type = 'q' then q_back = num
		otherwise
			call EX 1,' Usage:  -d <num>q  or  -d <num>y  or  no argument'
	end

	parse value date('S') with cur_year +4 mon +2 .
	cur_quarter = cur_year * 4 + (mon-1) % 3
	st_quarter = cur_quarter - q_back

	/* lists.d.o/search.html starts at Oct 1997 so use that */
	archive_start = 1997*4 + 3
	if st_quarter < archive_start then st_quarter = archive_start

	parse value MONTH_YEAR(st_quarter) with months','year
	say ' ['left(months,3)'-'year 'to today]'

	dates = ''
	do i = st_quarter to cur_quarter
		parse value MONTH_YEAR(i) with months','year
		dates = dates || '&dates='months'+'year
	end

	return dates

MONTH_YEAR: PROCEDURE EXPOSE (globals)

	parse arg quarter

	quarter_names = 'Jan+to+Mar Apr+to+Jun Jul+to+Sep Oct+to+Dec'
	yr = quarter % 4
	q_index = quarter//4 + 1

	return word(quarter_names, q_index)','yr

/* -------------------------------------------------------------------
 */
DOBROWSER: PROCEDURE EXPOSE (globals) browser

	parse arg url

	if pos('netscape', browser) = 0 then browser url
	else do		 /* Netscape */
		lockfile = value('HOME',,'SYSTEM')'/.netscape/lock'
		'[ -h' lockfile ']'
		/* if lockfile is set use existing Netscape */
		if RC = 0 then browser '-remote "openURL('url')" 2>/dev/null'
		else do
			browser url '2>/dev/null&'
			call sleep 5
		end
	end

	return

/* -------------------------------------------------------------------
 * RETURNS:
 *		indexfile - name of file for SECTION urls
 *		bodyfile - name of file for message bodies
 */
MAKETEMPFILES: PROCEDURE EXPOSE (globals) opt. bodyfile indexfile

	/* start the index temp file */
	call popen 'tempfile -p deb__ -d /tmp -s .html 2>/dev/null', 'tf.'
	if tf.0 <> 1 then call EX 1,'..could not create tempfile in /tmp'
	indexfile = tf.1
	call lineout indexfile,,
		'<html>'lf'<head>'lf'<title>Archive Search Results</title>'
	call lineout indexfile, '</head>'lf'<body>'

	if opt.$bodiestoo then do
		/* start the bodies tempfile */
		call popen 'tempfile -p deb__ -d /tmp -s .html 2>/dev/null', 'tf.'
		if tf.0 <> 1 then call EX 1,'..could not create tempfile in /tmp'
		bodyfile = tf.1
	end

	return

/* -------------------------------------------------------------------
 * RETURNS:
 *	  the sorted msgnumbers. stem array
 */
SORT: PROCEDURE EXPOSE (globals) msgnumbers.

	m = 1
	do while (9 * m + 4) < msgnumbers.0
		m = m * 3 + 1
	end

	do while m > 0
		k = msgnumbers.0 - m
		do j = 1 to k
			q = j
			do while q > 0
				l = q + m
				if msgnumbers.q >= msgnumbers.l then leave
				tmp = msgnumbers.q
				msgnumbers.q = msgnumbers.l
				msgnumbers.l = tmp
				q = q - m
			end
		end
		m = m % 3
	end

	return

/* -------------------------------------------------------------------
 * error trapping, usage
 */
USAGE: 
	do i = 3 to 10
		line = sourceline(i)
		if left(line,3) = '-+-' then do j = i + 1 for 25
			line = sourceline(j)
			if left(line,3) = '-*-' then leave i
			if left(line,1) <> '!' then say line
			else interpret substr(line,2)
		end j
	end i
	return 0

NOVALUE:
FAILURE:
SYNTAX:
ERROR:
HALT:
	err = condition('C') 'error, line' SIGL': "'condition('D')'"'
	call lineout 'stderr', err
	call CLEANUP
	exit 1
EX:
	parse arg ret,err
	call lineout 'stderr', err
    call CLEANUP
	exit ret
CLEANUP:
    if symbol('bodyfile') <> 'LIT' then 'rm 2>/dev/null' bodyfile
	if value('g.$keep') <> '1' then if symbol('indexfile') <> 'LIT'
		then 'rm 2>/dev/null' indexfile
    return
