#!/usr/bin/regina
/*
 vim:ts=4:noet:nowrap:sts=4:sw=4:
 $Id: searchgoo,v 1.39 2002/04/10 07:23:50 rick Exp $
 * Rick Younie <younie@debian.org>
-+-
searchgoo - search Google's usenet archive

  -H  - pull the Google usenet home page up in the browser
  -a  - author
  -b  - download message bodies too
  -c  - commence (start) date: yyyy mm dd; default is 2 years ago
  -e  - end date: yyyy mm dd, default is the current date; 'all' for all
  -nn - search the last nn days, e.g. -30; -c and -e are ignored
  -g  - restrict groups; trailing '*' is the only wildcard allowed
  -k  - keep the hits file (in /tmp); always kept if Netscape is your browser
  -m  - Message-Id
  -o  - optional; any of these words
  -p  - pattern; include all of these words
  -q  - phrase, as if quoted
  -s  - subject
  -t  - timeout in seconds, default 60
  -u  - unfiltered, bypasses Google's filter of like messages
  -x  - exclude these words
  -l  - language:

    cs  Czech       hu  Hungarian      no  Norwegian
    da  Danish      is  Icelandic      pl  Polish
    de  German      it  Italian        pt  Portuguese
    el  Greek       iw  Hebrew         ro  Romanian
    en  English     ja  Japanese       ru  Russian
    es  Spanish     ko  Korean         sv  Swedish
    et  Estonian    lt  Lithuanian     zh-CN  Chinese(Simplified)
    fi  Finnish     lv  Latvian        zh-TW  Chinese(Traditional)
    fr  French      nl  Dutch

-*-
 *
 */
	TRACE OFF
	SIGNAL ON HALT
	SIGNAL OFF ERROR
	SIGNAL ON FAILURE
	SIGNAL ON NOVALUE
	SIGNAL ON SYNTAX

/* -------------------------------------------------------------------
 *	  constants
 */
	lf		= '0a'x
	cr		= '0d'x
	wipe	= '1b'x || '[K' || cr
	hits_per_page = 100
	maxhits = 2000
	maxhitswithbodies = 500
	donefirst = 0
	totalhits = 0

	site	= 'groups.google.com'
	page	= '/advanced_group_search'
	default_browser = '/usr/bin/lynx'
	lang_abbrevs = 'cs da de el en es et fi fr hu is it iw ja ko lt',
					'lv nl no pl pt ro ru sv zh-CN zh-TW'

	/* included are vars/stems that have to be exposed through one
		procedure to get to another */
	global_vars = 'opt. lf cr wipe totalhits'

/* -------------------------------------------------------------------
 */
MAIN:

	parse arg rgs

	/* ----------------------------------------------
	 *	handle environment variables
	 * ---------------------------------------------*/

	call popen 'tempfile -p goo__ -d /tmp -s .html 2>/dev/null', 'tf.'
	if tf.0 <> 1 then call EX 1,'..could not create tempfile in /tmp'
	savefile = tf.1

	/* set browser */
	opt. = ''
	if value('SSBROWSER',,'SYSTEM') <> ''
		then opt.$browser = value('SSBROWSER',,'SYSTEM')
	else do
		if value('DISPLAY',,'SYSTEM') <> ''
			then opt.$browser = value('X11BROWSER',,'SYSTEM')
		else opt.$browser = value('CONSOLEBROWSER',,'SYSTEM')
	end
	if opt.$browser = '' then opt.$browser = default_browser
	'type >/dev/null 2>&1' opt.$browser	/* it exists after all that? */
	if RC <> 0 then call EX 1,'..you will have to install lynx',
		'or set *BROWSER - please see man page'
	if pos('netscape', opt.$browser) <> 0 then opt.$keep = 1

	call value 'REGINA_MACROS', '/usr/lib/searchscripts','SYSTEM'

	/* ----------------------------------------------
	 *	parse arguments
	 * ---------------------------------------------*/

	if rgs = '-h' | rgs = '--help' then signal USAGE
	else if rgs = '-H' then do
		call DOBROWSER 'http://'site||page
		exit 0
	end
	/* allow no -p if that's the only switch */
	else if pos(' -',rgs) = 0 then rgs = '-p' rgs

	/* parse the options */
	listpatterns = ''
	rgs = ' 'rgs
	do while rgs <> ''
		parse var rgs ' -' option ' -' +0 rgs

		parse var option opt val
		select
			when opt = 'a' then opt.$author = val
			when opt = 'b' then opt.$bodiestoo = 1
			when opt = 'c' then opt.$commence = val
			when opt = 'e' then opt.$end = val
			when opt = 'F' then opt.$force = 1
			when opt = 'g' then opt.$group = val
			when opt = 'k' then opt.$keep = 1
			when opt = 'l' then opt.$language = val
			when opt = 'm' then opt.$messageid = val
			when opt = 'o' then opt.$optional = val
			when opt = 'p' then opt.$pattern = val
			when opt = 'q' then opt.$quoted = val
			when opt = 's' then opt.$subject = val
			when opt = 't' then opt.$timeout = val
			when opt = 'u' then opt.$unfiltered = '&filter=0'
			when opt = 'x' then opt.$exclude = val
			when datatype(opt,'WHOLE') then opt.$daysago = abs(opt)
			otherwise call EX 1,'..arg error "-'opt'"'
		end
	end
	/* sanity checks */
	if datatype(opt.$timeout) <> 'NUM' then opt.$timeout = 60
	if opt.$keep <> 1 then opt.$keep = 0
	if opt.$bodiestoo <> 1 then opt.$bodiestoo = 0
	if opt.$force <> 1 then opt.$force = 0
	if opt.$language <> '' then do
		if wordpos(opt.$language, lang_abbrevs) = 0
			then call EX 1,'..unknown language abbrev "'opt.$language'"'
		opt.$language = 'lang_'opt.$language
	end
	if opt.$group = 'deb' then opt.$group = 'linux.debian.*'

	/* ----------------------------------------------
	 *	set the date limits
	 * ---------------------------------------------*/

	if opt.$commence = '' & opt.$end = '' & opt.$daysago = ''
	  then opt.$quick = 'quick'
	  else opt.$quick = 'between'

	parse value date('S') with yyyy +4 mm +2 dd
	today = yyyy mm dd
	if opt.$commence = 'all' | opt.$end = 'all' then do
		opt.$commence = '1995 02 31'
		opt.$end = today
	end
	else if opt.$daysago <> '' then do
		opt.$end = today
		parse value date('S',date('B') - opt.$daysago,'B') with yyyy +4 mm +2 dd
		opt.$commence = yyyy mm dd
	end
	else do
		opt.$commence = space(translate(opt.$commence,' ',':/.-'))
		opt.$end = space(translate(opt.$end,' ',':/.-'))
		if opt.$end = '' then opt.$end = today
		if opt.$commence = '' then opt.$commence = yyyy-2 mm dd
	end
	if \datatype(space(opt.$commence,0,' '),'NUM')
		then call EX 1,'..not a valid date "'opt.$commence'"' 
	if \datatype(space(opt.$end,0,' '),'NUM')
		then call EX 1,'..not a valid date "'opt.$end'"' 

	say 'Search:' opt.$commence '->' opt.$end

	/* ----------------------------------------------
	 *	pull/parse/display the pages
	 * ---------------------------------------------*/

	nexturl = MAKE_QUERY()

	ix = 0  /* index thru bodyurl./index. if -b; else index thru body. */
	do i = 1
		loop_ret = GETPAGE(site,nexturl,opt.$timeout)
		if loop_ret <> 0 then leave i

		/* FIXME: this still work? need a search that returns 1 hit */
		/* if only one hit, add Google base url and show it raw */
		if i = 1 then do
			if pos('Results <b>1 - 1</b> of <b>1</b>',sockbuff) <> 0 then do
				parse var sockbuff hed '</head>' bod
				call charout savefile, hed||lf'<BASE href="http://groups.google.com"></head>'lf||bod
				SIGNAL MSG_ID 
			end
		end

		parse value GET_HITSBLOCK_AND_NEXTURL() with loop_ret','nexturl
		if loop_ret <> 0 then leave i

		/* GET_HITSBLOCK_AND_NEXTURL() has left us with a block of only hits */
		if \opt.$bodiestoo then call SCRAPE_MSGS hitsblock
		else call MAKE_BODY_INDEX hitsblock

		if ix <> 0 then call charout ,cr||right(ix,6) totalhits||wipe

		if nexturl = '' then leave i	/* not -b; no more pages */
	end i
	say	/* leave hits displayed */

	/* if -b & the url pull was aborted, there's nothing else we can do.
	 * The urls will be massaged to 'a names' pointing into the file and
	 * we were asked to abort so we can't pull bodies so satisfy those links.
	 */
	if loop_ret <> 0 & opt.$bodiestoo then call EX 1,'..aborting'

	if opt.$bodiestoo then do

		/* write the in-file links */
		do i = 1 to ix
			call lineout savefile, aname.i
			if i // 10 = 0 then call lineout savefile,''
		end
		call lineout savefile, '</pre>'
		call lineout savefile, '<BASE href="http://groups.google.com">'

		/* fetch/write the bodies corresp. to those link #s */
		do i = 1 to ix
			call charout ,cr || right(i,6) ix || wipe

			call lineout savefile, lf'<hr><A NAME="'i'"></A><hr>'
			parse value PULL_BODY(bodyurl.i) with ret','body
			if ret <> 0 then leave
			
			call lineout savefile, body'</pre>'lf
		end

	end

	/* not -b; the queued msg #'s index the Google msg summaries in body. */
	else do
		do i = 1 to ix
			call lineout savefile, '<hr>'
			call lineout savefile, body.i
		end
		call lineout savefile, '<hr>'lf'</table>'
	end

	call lineout savefile, '</body>'
MSG_ID:
	call stream savefile,'C','CLOSE'

	if totalhits > 0 then call DOBROWSER savefile
	else say '..no hits'

	call CLEANUP

	exit 0


/* -------------------------------------------------------------------
 * RETURNS:
 *	the '>Next<' url
 */
GET_HITSBLOCK_AND_NEXTURL: PROCEDURE EXPOSE (global_vars) sockbuff,
	site maxhitswithbodies maxhits donefirst savefile hitsblock

	SIGNAL ON HALT NAME CTRL_4

	if \donefirst then do
		donefirst = 1

		/* 1st block parses different - get hits block and Next block */
		parse var sockbuff head '</head>' no_hitsblock '>Results' numhitsblock '<p><a href=' +0 hitsblock '<div class=n' nextblock

		/* no hits? */
		if pos('<br>Your search',no_hitsblock) <> 0
			then call EX 1,'..no hits'

		/* number hits:      <b>1</b> - <b>21</b> of [about] <b>2,100</b>. */
		/* - this changes often; parsing from the end is more robust */
		/* end space nec. If searching for "autoconf2.13" - it matches ... */
		/* "autoconf 2</i></b>.13" instead of "about <b>146</b>.   Search" */
		stmark = reverse('</b>. ')
		parse value reverse(numhitsblock) with (stmark) totalhits '>'
		totalhits = changestr(',',reverse(totalhits),'')

		if \opt.$force then do
			if totalhits > maxhits then call EX 1,'.. >'maxhits,
				'hits ('totalhits'); please narrow search'
			if opt.$bodiestoo then if totalhits > maxhitswithbodies
				then call EX 1,'.. >'maxhitswithbodies 'hits ('totalhits')',
				'and -b switch; please narrow search'
		end

		/* print head w. base url */
		call lineout savefile, strip(head,'B',lf)
		call lineout savefile, '</head>'lf'<body>'
		if opt.$bodiestoo then call lineout savefile, '<pre>'
		else do
			call lineout savefile, '<table>'
			call lineout savefile, '<BASE href="http://groups.google.com">'
		end
	end
	else do
		parse var sockbuff . '<p><a href' +0 hitsblock '<div class=n' nextblock
	end

	/* extract the next url if any */
	nexturl = ''

	do while nextblock <> ''
		parse var nextblock line (lf) nextblock
		if pos('>NEXT<',translate(line)) = 0 then iterate
		href_idx = lastpos('<a href=',line)
		if href_idx <> 0 then line = substr(line,href_idx)
		parse var line '<a href=' nexturl '>'
		nexturl = strip(nexturl,'B','"')
		leave
	end

	return '0,'nexturl
CTRL_4:
	return -1',""'

/* -------------------------------------------------------------------
 * RETURNS:
 *	body.  - stem array of post digests indexed by number, 1 -> ix
 */
SCRAPE_MSGS: PROCEDURE EXPOSE (global_vars) body. ix

	parse arg hits

	st = '<p><a href'
	parse var hits (st) hits
	do while hits <> ''
		parse var hits stanza (st) hits

		stanza = changestr('<br>',stanza,' ')
		stanza = changestr(lf,stanza,' ')
		stanza = changestr('<center>',stanza,' ')
		stanza = changestr('<a href=',stanza,'<br><a href=')

		parse var stanza stanza '<p>' .
		stanza = strip(stanza,'B',lf)

		ix = ix + 1
		body.ix = st || stanza || lf
	end

	return

/* -------------------------------------------------------------------
 * RETURNS
 *	aname.  - stem array of in-page links
 *	bodyurl. - array of message urls, 1 -> ix
 */
MAKE_BODY_INDEX: PROCEDURE EXPOSE (global_vars) aname. bodyurl. ix

	parse arg hits

	st = '<p><a href='
	parse var hits (st) hits
	do while hits <> ''
		parse var hits stanza (st) hits

		parse var stanza url '>' subj '</a>' adstring
		subj = strip(subj)
		if pos('<b>',subj) <> 0 then do
			subj = changestr('<b>',subj,'')
			subj = changestr('</b>',subj,'')
		end

		parse value reverse(adstring) with '-' rauth ' yb ' rdate '-'
		parse value reverse(rdate) with dd mm yyyy .
		date = dd || mm || right(yyyy,2)

		parse value reverse(rauth) with author '-'
		author = strip(author)
		if pos('&',author) <> 0 then author = UNMARK(author)

		ix = ix + 1
		bodyurl.ix = url

		ind = date left(author,20)
		if length(subj) <= 50 then aname.ix = ind '<a href="#'ix'">'subj'</A>'
		else aname.ix = ind '<a href="#'ix'">'left(subj,50)'</A>'

	end

	return

UNMARK: PROCEDURE EXPOSE (global_vars)

	parse arg line

	table = '&amp; & &lt; < &gt; > &quot; " &#39; '''
	do i = 1 to words(table) by 2
		markup = word(table,i)
		line = changestr(word(table,i),line,word(table,i+1))
		if pos('&',line) = 0 then leave
	end i

	return line

/* -------------------------------------------------------------------
 */
PULL_BODY: PROCEDURE EXPOSE (global_vars) site

	parse arg url

	ret = GETPAGE(site,url,opt.$timeout)

	/* this catches a google internal error */
	errmsg = '>Message id or article number'
	parse var sockbuff (errmsg) err '<'
	if err <> '' then buff = substr(errmsg err,2)
	else parse var sockbuff '<tr><td>' . '</table>' buff '</pre>'

	return ret','buff

/* -------------------------------------------------------------------
 *
 */
MAKE_QUERY: PROCEDURE EXPOSE (global_vars) hits_per_page

	parse var opt.$commence yr mo dy
	cdate = '&as_mind='dy'&as_minm='mo'&as_miny='yr
	parse var opt.$end yr mo dy
	edate = '&as_maxd='dy'&as_maxm='mo'&as_maxy='yr

	string =	'/groups?as_q='MARKUP(opt.$pattern) ||,
				'&num='hits_per_page'&as_scoring=date&btnG=Google+Search' ||,
				'&as_oq='MARKUP(opt.$optional) ||,
				'&as_epq='MARKUP(opt.$quoted) ||,
				'&as_eq='MARKUP(opt.$exclude) ||,
				'&as_ugroup='MARKUP(opt.$group) ||,
				'&as_usubject='MARKUP(opt.$subject) ||,
				'&as_uauthors='MARKUP(opt.$author) ||,
				'&as_umsgid='MARKUP(opt.$messageid) ||,
				'&lr='opt.$language ||,
				'&as_qdr=&as_drrb='opt.$quick ||,
				cdate || edate || opt.$unfiltered

	return string

MARKUP: PROCEDURE EXPOSE (global_vars)

	parse arg unmarked
	marked = changestr(' ',space(unmarked),'+')
	return marked

/* -------------------------------------------------------------------
 * get a web page with the PULLPAGE external socket function
 * RETURN:  0 if no error
 *			sockbuff is the page
 */
GETPAGE: PROCEDURE EXPOSE (global_vars) sockbuff

	SIGNAL ON HALT NAME CTRL_3

	parse arg site,url,timeout,header,dots

	/* pull a page of hits */
	return = PULLPAGE(site, url, timeout, header, dots)
	parse var return errnum ',' sockbuff

	if errnum <> 0 then do
		if errnum = -3 then say '  ..no rxsock function library'
		else if errnum = -2 then say '  ..timeout'
		else if errnum > 0 then say '  ..socket error:' errnum
	end

	return errnum
CTRL_3:
	return -1

/* -------------------------------------------------------------------
 * 
 */
DOBROWSER: PROCEDURE EXPOSE (global_vars)

	parse arg url

	if pos('netscape', opt.$browser) = 0 then opt.$browser url
	else do		 /* Netscape */
		lockfile = value('HOME',,'SYSTEM')'/.netscape/lock'
		'[ -h' lockfile ']'
		/* if lockfile is set use existing Netscape */
		if RC = 0 then opt.$browser '-remote "openURL('url')" 2>/dev/null'
		else do
			opt.$browser url '2>/dev/null&'
			call sleep 5
		end
	end

	return

/* -------------------------------------------------------------------
 * if the usage line starts with '!' it is interpreted
 *	to allow variable substitution
 */
USAGE:
	do i = 3 to 10
		line = sourceline(i)
		if left(line,3) <> '-+-' then iterate i
		do j = i + 1 for 50
			line = sourceline(j)
			if left(line,3) = '-*-' then leave i
			if left(line,1) <> '!' then say line
			else interpret substr(line,2)
		end j
	end i
QUIT:
	exit 0

NOVALUE:
FAILURE:
SYNTAX:
ERROR:
HALT:
	call lineout 'stderr',,
		 condition('C') 'error, line' SIGL': "'condition('D')'"'
	call CLEANUP
	exit 1
EX:
	parse arg ret,err
	call lineout 'stderr', err
	call CLEANUP
	exit ret
CLEANUP:
	if value('opt.$keep') <> '1' then if symbol('savefile') <> 'LIT'
		then 'rm 2>/dev/null' savefile
	return
