#!/bin/bash
# gr.  Generated from gr.in by configure.
# This is the front-end to An Gramadoir, an Irish language grammar checker.
# Copyright (C) 2003 Kevin P. Scannell <scannell@slu.edu>
#
# This is free software; see the file COPYING for copying conditions.  There is
# NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Usage: gr [OPTIONS] [FILES]
#
# This is a script for checking the grammar of Irish language text
# contained in the files specified on the command line, or read from
# standard input if no arguments are given.   The default behavior is
# to write a summary of possible errors to standard output.
#
# There are several command line options.  See the web page
# http://borel.slu.edu/gramadoir/
# for a description of these, or use "gr --help"

# NOT internationalized
# Build-time configuration.  The header says this file is generated from
# gr.in by configure, so these literals are presumably substituted at build
# time -- TODO confirm before editing them by hand.
PACKAGE_NAME="gramadoir"
# Version string printed by --version.
packageversion="0.4"
# Address printed in the bug-report line of --help.
bugemail="scannell@slu.edu"
# Message catalog directory and domain used by the gettext calls below.
TEXTDOMAINDIR="/usr/share/locale"
TEXTDOMAIN="${PACKAGE_NAME}"
# Space-separated list of language codes that --teanga is checked against.
TEANGACHA="ga en "
# Directory containing the TEANGACHA data file that is read at startup.
DATADIR="/usr/share/${PACKAGE_NAME}"
export TEXTDOMAINDIR TEXTDOMAIN

# When LC_ALL is set it takes precedence over every other locale variable.
# It is cleared below, so copy its value first into the two categories
# that gettext still needs.
if [ -n "${LC_ALL}" ]; then
	LC_MESSAGES="${LC_ALL}"
	LC_CTYPE="${LC_ALL}"
	export LC_MESSAGES LC_CTYPE
fi

# LC_ALL has to be empty so that LC_MESSAGES may differ from
# LC_COLLATE/LC_CTYPE; collation is pinned to the C locale.
LC_ALL=""
LC_COLLATE="C"
export LC_ALL LC_COLLATE

# Name this script was invoked as, with any directory part removed; it is
# used in the usage line and in the "Try ... --help" hint.
# "##*/" strips everything up to the last slash, so relative invocations
# such as ./gr or bin/gr are reduced as well as absolute paths.
EXECUTABLE="${0##*/}"
HELPCOMMAND="${EXECUTABLE} --help"

# TRANSLATORS: Please leave untranslated, but add an acute accent to the
# "o" in "Gramadoir" if available in your character set and encoding.
# If you must translate, this is the Irish for (literally) "The Grammarian"
CLAR=$(gettext "An Gramadoir")
checking=$(gettext "Currently checking %s")
notfound=$(gettext "There is no such file.")
isdir=$(gettext "Is a directory")
permissions=$(gettext "Permission denied")
usage1=$(gettext "Usage: %s [OPTIONS] [FILES]")
usage2=$(gettext "Options for end-users:")

# TRANSLATORS: ~/.neamhshuim is an "ignore file" like those with spellcheckers
userop1=$(gettext "    --iomlan       report all errors (i.e. do not use ~/.neamhshuim)")
userop2=$(gettext "    --ionchod=ENC  specify the character encoding of the text to be checked")
userop3=$(gettext "    --litriu       write misspelled words to standard output")
userop4=$(gettext "    --aspell       suggest corrections for misspellings (requires GNU aspell)")
userop5=$(gettext "    --teanga=XX    specify the language of the text to be checked (default=ga)")
userop6=$(gettext "    --help         display this help and exit")
userop7=$(gettext "    --version      output version information and exit")
usage3=$(gettext "Options for developers:")

# TRANSLATORS: "disambiguation" is a term from natural language processing.
# Here it means the process of deciding, for a given word with several
# possible parts of speech (noun, verb, etc.), the correct part of speech
# given the context.  Brill's algorithm is "unsupervised" because it relies
# only on statistics gathered from texts which have not been tagged in advance.
devop1=$(gettext "    --brill        find disambiguation rules via Brill's unsupervised algorithm")
devop2=$(gettext "    --html         produce HTML output for viewing in a web browser")
devop3=$(gettext "    --ilchiall     report unresolved ambiguities, sorted by frequency")
devop4=$(gettext "    --minic        output all tags, sorted by frequency (for unigram-xx.txt)")

# TRANSLATORS: By default, if there is no rule in the disambiguation module
# for selecting the correct part of speech of an ambiguous word, the program
# chooses the part of speech with the highest frequency.  This is sometimes
# called "unigram" tagging.   The --no-unigram option turns this behavior off.
devop5=$(gettext "    --no-unigram   do not resolve ambiguous parts of speech by frequency")

# TRANSLATORS: The grammar checker works by piping the input text
# through a sequence of filters which add XML markup indicating
# important grammatical information.  The --xml option displays the
# marked up file as an aid in debugging.
devop6=$(gettext "    --xml          write tagged XML stream to standard output, for debugging")
stdinhelp=$(gettext "If no file is given, read from standard input.")
bugreports=$(gettext "Send bug reports to <%s>.")
versionstring=$(gettext "version %s")
gpl=$(gettext "This is free software; see the source for copying conditions.  There is NO\nwarranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE,\nto the extent permitted by law.")
tryhelp=$(gettext "Try %s for more information.")
unknownopt=$(gettext "unrecognized option %s")
missingarg=$(gettext "option %s requires an argument")
unwanted=$(gettext "option %s does not allow an argument")
unsupported=$(gettext "Language %s is not supported.")

# TRANSLATORS: "conversion" here means conversion between character encodings
iconvproblem=$(gettext "conversion from %s is not supported")
aspellproblem=$(gettext "aspell-%s is not installed")
ANAITHNID=$(gettext "Unknown word")

# TRANSLATORS: only two unknown words are reported per sentence
ANAITHNIDSRL=$(gettext "Unknown word (ignoring remainder in this sentence)")
NEAMHCHOIT=$(gettext "Valid word but extremely rare in actual usage")
INPHRASE=$(gettext "Usually used in the set phrase \/\1\/")
BACHOIR=$(gettext "You should use \/\1\/ here instead")
CAIGHDEAN=$(gettext "Non-standard form: perhaps use \/\1\/?")

# TRANSLATORS: "Mutation" refers to either "lenition" or "eclipsis" (see below)
CLAOCHLU=$(gettext "Initial mutation missing")

# TRANSLATORS: "Lenition" is the softening of an initial consonant in Irish.
# It is indicated in writing by the addition of an "h": e.g. "bean" -> "bhean"
NISEIMHIU=$(gettext "Unnecessary lenition")
PREFIXH=$(gettext "Prefix \/h\/ missing")
PREFIXT=$(gettext "Prefix \/t\/ missing")
SEIMHIU=$(gettext "Lenition missing")

# TRANSLATORS: "Eclipsis" is, like lenition, a phonetic change applied to
# initial consonants in Irish.  It is indicated in writing by the addition
# of the eclipsing consonant as a prefix: e.g. "bean" -> "mbean"
URU=$(gettext "Eclipsis missing")
DUBAILTE=$(gettext "Repeated word")
CUPLA=$(gettext "Unusual combination of words")
BREISCHEIM=$(gettext "Comparative adjective required")
NIAITCH=$(gettext "Unnecessary prefix \/h\/")
NITEE=$(gettext "Unnecessary prefix \/t\/")
ONEART=$(gettext "Unnecessary use of the definite article")
GENITIVE=$(gettext "The genitive case is required here")

# Remember the caller's LC_CTYPE so it can be handed back to the gettext
# calls made inside cuardach; everything else then runs with LC_CTYPE=C.
OLDCTYPE="${LC_CTYPE}"
LC_CTYPE="C"
export LC_CTYPE

# i18n: grammatical errors.  See rialacha-*.in.
# If you port to another language, you won't translate these; instead
# they will be replaced completely with messages corresponding to
# the errors for your language.
#
# loganu: stdin-to-stdout filter that swaps the symbolic error names
# (ANAITHNID, SEIMHIU, ...) found directly after a double quote for the
# messages held in the shell variables of the same name.
#  - The messages are spliced straight into the sed program, so each one is
#    used as sed replacement text: "\/" yields a literal slash and "\1"
#    re-inserts the text captured from NAME{...}.
#  - ANAITHNIDSRL is substituted before ANAITHNID, presumably because the
#    shorter name is a prefix of the longer one.
#  - On lines mentioning CAIGHDEAN every underscore is first turned into a
#    space.
loganu()
{
sed "
s/\"ANAITHNIDSRL/\"${ANAITHNIDSRL}/g
s/\"ANAITHNID/\"${ANAITHNID}/g
s/\"NEAMHCHOIT/\"${NEAMHCHOIT}/g
s/\"INPHRASE{\([^}]*\)}/\"${INPHRASE}/g
s/\"BACHOIR{\([^}]*\)}/\"${BACHOIR}/g
/CAIGHDEAN/s/_/ /g
s/\"CAIGHDEAN{\([^}]*\)}/\"${CAIGHDEAN}/g
s/\"CLAOCHLU/\"${CLAOCHLU}/g
s/\"NISEIMHIU/\"${NISEIMHIU}/g
s/\"PREFIXH/\"${PREFIXH}/g
s/\"PREFIXT/\"${PREFIXT}/g
s/\"SEIMHIU/\"${SEIMHIU}/g
s/\"URU/\"${URU}/g
s/\"DUBAILTE/\"${DUBAILTE}/g
s/\"CUPLA/\"${CUPLA}/g
s/\"BREISCHEIM/\"${BREISCHEIM}/g
s/\"NIAITCH/\"${NIAITCH}/g
s/\"NITEE/\"${NITEE}/g
s/\"ONEART/\"${ONEART}/g
s/\"GENITIVE/\"${GENITIVE}/g
"
}

# A literal tab character, used inside sed bracket expressions.
# printf is a shell builtin, so no external awk process is needed and the
# value cannot silently come out empty when gawk is not installed.
TAB=$(printf '\t')

# Handler for --version: writes the program name and version, the copyright
# line and the licence notice to standard output.
# "Copyright" is left untranslated, per the GNU coding standards.
versionout()
{
	local banner="${CLAR}, ${versionstring}\n"
	printf "${banner}" "${packageversion}"
	printf '%s\n' "Copyright (C) 2003 Kevin P. Scannell"
	# gpl is the format string itself so that its \n escapes are expanded
	printf "${gpl}\n"
}

# HTML flavour of the --version output: the same three pieces of
# information as versionout, wrapped in a paragraph with links and followed
# by a horizontal rule.
htmlversion()
{
	local heading="<a href=\"http://borel.slu.edu/gramadoir/\">${CLAR}</a>, ${versionstring}<br>\n"
	printf '%s\n' "<p>"
	printf "${heading}" "${packageversion}"
	printf '%s\n' \
		"Copyright (C) 2003 <a href=\"http://borel.slu.edu/\">Kevin P. Scannell</a><br><br>" \
		"<i>"
	# gpl is the format string itself so that its \n escapes are expanded
	printf "${gpl}\n"
	printf '%s\n' "</i></p><hr>"
}

# Handler for --help: prints the usage line, the end-user options, the
# developer options, the note about standard input and the bug-report
# address, with a blank line between the sections.
helpout()
{
	local line

	printf "${usage1}\n" "${EXECUTABLE}"
	echo
	for line in "${usage2}" "${userop1}" "${userop2}" "${userop3}" \
		"${userop4}" "${userop5}" "${userop6}" "${userop7}"
	do
		echo "${line}"
	done
	echo
	for line in "${usage3}" "${devop1}" "${devop2}" "${devop3}" \
		"${devop4}" "${devop5}" "${devop6}"
	do
		echo "${line}"
	done
	echo
	echo "${stdinhelp}"
	echo
	printf "${bugreports}\n" "${bugemail}"
}

# Reports an unrecognized option and points the user at --help.
# $1 - the offending option as typed on the command line
option_error()
{
	local badopt="${1}"
	local complaint="${CLAR}: ${unknownopt}\n"
	local hint="${tryhelp}\n"

	printf "${complaint}" "${badopt}"
	printf "${hint}" "${HELPCOMMAND}"
}

# Reports an option that was given without its required argument and points
# the user at --help.
# $1 - the offending option as typed on the command line
no_argument()
{
	local badopt="${1}"
	local complaint="${CLAR}: ${missingarg}\n"
	local hint="${tryhelp}\n"

	printf "${complaint}" "${badopt}"
	printf "${hint}" "${HELPCOMMAND}"
}

# Reports an option that was given an argument although it takes none and
# points the user at --help.
# $1 - the offending option (without the "=value" part)
unwanted_arg()
{
	local badopt="${1}"
	local complaint="${CLAR}: ${unwanted}\n"
	local hint="${tryhelp}\n"

	printf "${complaint}" "${badopt}"
	printf "${hint}" "${HELPCOMMAND}"
}


# Input-side encoding filter.  The name dates from when this only turned
# UTF-8 into ISO-8859-1; it now uses iconv to convert standard input from
# the user's encoding (IONCHOD) into the checker's native one (NATIVE).
# When the two are the same the stream is passed through untouched.
diutf()
{
	case "${IONCHOD}" in
	"${NATIVE}" )
		cat
		;;
	* )
		iconv -f "${IONCHOD}" -t "${NATIVE}"
		;;
	esac
}

# Output-side counterpart of diutf: converts standard input from the native
# encoding (NATIVE) back into the user's encoding (IONCHOD) with iconv, or
# copies it through unchanged when both are the same.
toutf()
{
	case "${IONCHOD}" in
	"${NATIVE}" )
		cat
		;;
	* )
		iconv -f "${NATIVE}" -t "${IONCHOD}"
		;;
	esac
}

# escape punctuation marks which cannot be end-of-sentence.
# Don't even care if there isn't space or quotes after since
# these surely won't be end of sentence either.
#
# Filter (stdin to stdout).  The sed program puts a backslash in front of a
# ".", "?" or "!" in these situations:
#   - after one or two digits whose preceding character is not a digit, a
#     hyphen or one of the ${BDCHARS} characters;
#   - after a dot followed by one or two arbitrary characters, or directly
#     after another dot -- except that after ".ie" and ".uk" the escape is
#     removed again;
#   - after two consecutive characters drawn from I, V and X.
# (Inside the double-quoted script "\\\\" reaches sed as "\\", i.e. one
# literal backslash.)
# The result is piped through the language-specific giorr-${TEANGA}
# program, which receives ${BDCHARS} as its only argument.
# NOTE(review): abairti passes an argument to this function, but the body
# never reads $1; it relies on the global BDCHARS instead.
escape_punc()
{
sed "
s/\([^${BDCHARS}0-9-][0-9]\)\([.?!]\)/\1\\\\\2/g
s/\([^${BDCHARS}0-9-][0-9][0-9]\)\([.?!]\)/\1\\\\\2/g
s/\(\...\)\([.?!]\)/\1\\\\\2/g
s/\.ie\\\\\([.?!]\)/.ie\1/g
s/\.uk\\\\\([.?!]\)/.uk\1/g
s/\(\..\)\([.?!]\)/\1\\\\\2/g
s/\(\.\)\([.?!]\)/\1\\\\\2/g
s/\([IVX][IVX]\)\([.?!]\)/\1\\\\\2/g
" |
/usr/lib/gramadoir/giorr-${TEANGA} "${BDCHARS}"
}

# strips any pre-existing markup, tokenizes with <c></c>,
# inserts line number markup, puts one sentence per line
# final filter is there because of some indecision about
# whether I like the line number markups on the interiors of lines
#
# Stages of the pipeline, in order (stdin to stdout):
#   1. tr      - delete carriage returns.
#   2. grep -n - prefix every input line with "N:" (its line number).
#   3. sed     - delete existing <...> tags, then blank out any remaining
#                "&", "<", ">" and backslash characters.
#   4. escape_punc - backslash-escape punctuation that cannot end a sentence.
#   5. sed     - wrap each run starting with a BDCHARS character in <c></c>,
#                move trailing INTCHARS characters outside the </c>, turn the
#                "N:" prefix into <line uimhir="N">, and add </line> after
#                every unescaped ".", "?" or "!" (plus any closing quotes or
#                brackets) that is followed by a space or ends the line.
#   6. sed     - break the line after each </line> that has text after it.
#   7. tr      - drop the backslashes that marked escaped punctuation.
#   8. athuimhir, athcheang - give every sentence a line-number tag and
#                rejoin sentences that span several input lines.
#   9. sed     - remove <line uimhir=...> tags that are not at the very
#                start of an output line.
abairti()
{
tr -d "\015" |
grep -E -n '^' |
sed "s/<[^>]*>//g; s/[&\<>]/ /g" |
escape_punc "${BDCHARS}" | 
sed "
s/[${BDCHARS}][${BDCHARS}${INTCHARS}]*/<c>&<\/c>/g;
s/\([${INTCHARS}][${INTCHARS}]*\)<\/c>/<\/c>\1/g;
s/^\([1-9][0-9]*\):/<line uimhir=\"\1\"> /; 
s/\([^\][.?!][]\"\')}]*\) [ ${TAB}\n]*/\1 <\/line>/g;
s/[^\][.?!][]\"\')}]*$/& <\/line>/" |
sed 's/<\/line>\(.\)/<\/line>\
\1/g' |
tr -d '\\' |
athuimhir |
athcheang |
sed 's/\(.\)<line uimhir[^>]*>/\1/g'
}

# helper for abairti.  If a sentence starts in the middle of a line,
# need the line number tag to propagate down to the beginning of this
# sentence.  Thus, the first chunk below stores the line number markup
# in hold space and the second chunk retrieves it and inserts at beginning
#  n.b. POSIX sed requires no whitespace after the "!" in the negated
#  match below.    Discovered while porting to an old DEC machine.
#
# In detail (stdin to stdout):
#   - A line that starts with <line uimhir="N"> is printed unchanged, and
#     just that tag is left behind in the hold space.
#   - Any other line gets the held tag appended and then moved to the front,
#     separated from the text by one space.
#   - Lines that consist of nothing but a line-number tag and spaces are
#     deleted.
athuimhir()
{
sed '
/^<line uimhir=/ {
h
s/^\(<line uimhir=\"[1-9][0-9]*\">\).*$/\1/
x
}

/^<line uimhir=/!{
G
s/^\(.*\)\n\(<line uimhir=\"[1-9][0-9]*\">\)$/\2 \1/
}

/^<line uimhir=\"[1-9][0-9]*\"> *$/d
'
}

# helper for abairti.  This is the "complement" to athuimhir, for when
# a sentence spans across lines.  It needs to be "rejoined"
# (athcheangailte) onto a single line.
#
# In detail (stdin to stdout):
#   - A line that does not end in </line> is appended to the hold space and
#     produces no output yet.
#   - A line that ends in </line> is appended to the hold space as well; the
#     accumulated text is then swapped back in (leaving the hold space
#     empty), its embedded newlines are removed, and every run of spaces and
#     tabs is squeezed into a single space before it is printed.
athcheang()
{
sed "
/<\/line>$/ {
H
s/^.*$//
x
s/\n//g
s/[ ${TAB}][ ${TAB}]*/ /g
}

/<\/line>$/!{
H
d
}
"
}

# Runs the language-specific comhshuite-${TEANGA}.pl perl script as a
# filter (it inherits this function's standard input and output).
# The script path is quoted so that an unexpected TEANGA value cannot be
# split into several words or glob-expanded.
comhshuite()
{
	/usr/bin/perl "/usr/lib/gramadoir/comhshuite-${TEANGA}.pl"
}

# Disambiguation filter: takes the markup generated by looking up words in
# the dictionary and tries to resolve any ambiguities by looking at local
# context.  The work is done by the language-specific aonchiall-${TEANGA}.pl
# perl script, which inherits this function's standard input and output.
#
# The rules are discovered inductively by looking at examples of
# ambiguities in real Irish texts, and they are meant to be ordered
# according to their "confidence level".
#
# The script path is quoted so that an unexpected TEANGA value cannot be
# split into several words or glob-expanded.
aonchiall()
{
	/usr/bin/perl "/usr/lib/gramadoir/aonchiall-${TEANGA}.pl"
}

# Runs the language-specific unigram-${TEANGA}.pl perl script as a filter
# (it inherits this function's standard input and output).
# The script path is quoted so that an unexpected TEANGA value cannot be
# split into several words or glob-expanded.
unigram()
{
	/usr/bin/perl "/usr/lib/gramadoir/unigram-${TEANGA}.pl"
}

# This is the workhorse.  It does the actual marking up of errors, by
# running the language-specific rialacha-${TEANGA}.pl perl script as a
# filter (it inherits this function's standard input and output).
# The script path is quoted so that an unexpected TEANGA value cannot be
# split into several words or glob-expanded.
rialacha()
{
	/usr/bin/perl "/usr/lib/gramadoir/rialacha-${TEANGA}.pl"
}

# This strips off errors that come from exceptions to rules, by running the
# language-specific eisceacht-${TEANGA}.pl perl script as a filter (it
# inherits this function's standard input and output).
# The script path is quoted so that an unexpected TEANGA value cannot be
# split into several words or glob-expanded.
eisceacht()
{
	/usr/bin/perl "/usr/lib/gramadoir/eisceacht-${TEANGA}.pl"
}

# Turns the tagged XML on standard input into a sed script on standard
# output that adds spelling suggestions to "unknown word" errors.
#   1. grep/sort - collect the distinct "><X>word" fragments.
#   2. aspell -a - check them with the ${TEANGA} dictionary; only the result
#                  lines starting with "&" are kept (presumably the
#                  misspellings that have suggestions -- see the aspell
#                  pipe-mode documentation).
#   3. sed       - reduce each result line to "word suggestions", cut the
#                  list at the fifth ", " so that at most five suggestions
#                  remain, and emit two s commands per word (one for
#                  ANAITHNID, one for ANAITHNIDSRL) that insert the
#                  suggestions, wrapped in slashes and followed by "?",
#                  right after the error name.
# NOTE(review): ${TEANGA} is expanded unquoted in the aspell call.
aspell_search()
{
grep -E -o '><X>[^<]*' | sort -u | aspell -a -d ${TEANGA} | grep -E '^&' | 
sed '
s/^& \([^ ]*\).*:/\1/
s/, /%/5
s/%.*//
s/^\([^ ]*\) \(.*\)/s\/ANAITHNID"><X>\1<\/ANAITHNID \\\/\2\\\/?"><X>\1<\/\
s\/ANAITHNIDSRL"><X>\1<\/ANAITHNIDSRL \\\/\2\\\/?"><X>\1<\//
'
}

# Same filter chain as vanilla_xml_output, but with aspell suggestions
# added to the "unknown word" errors (used when --aspell is given).
#
# The checked XML is written to one scratch file while aspell_search turns
# it into a sed script in a second one; the script is then applied to the
# saved XML.  Both scratch files come from mktemp and are removed at the
# end.  If mktemp fails the function gives up (status 1) instead of falling
# back to a fixed, guessable name in /tmp.
aspell_xml_output()
{
	local tempxml tempsed

	tempxml=$(mktemp) || return 1
	if ! tempsed=$(mktemp)
	then
		rm -f "${tempxml}"
		return 1
	fi

	unchecked_xml |
	comhshuite |
	aonchiall | aonchiall |
	unigram |
	rialacha |
	eisceacht | tee "${tempxml}" |
	aspell_search > "${tempsed}"

	sed -f "${tempsed}" < "${tempxml}"
	rm -f "${tempxml}" "${tempsed}"
}

# This is the final filter used by the "gr" program.
# It generates a Bourne shell script which produces the
# final output.  It is the part I like the least and
# users should feel free to replace it with their own
# front ends.
#
# Only input lines containing an <E ...> element produce output.  For each
# such line two lines of shell code are printed:
#
#   echo "N: sentence text" | myaibhsigh"(err1|err2)"
#   echo "msg1," "msg2."; echo
#
# where N comes from the <line uimhir="N"> tag, the sentence text is the
# line with all markup removed (and ", $ and ` backslash-escaped so that it
# is safe inside the double quotes), err1|err2 are the text contents of the
# <E> elements, and msg1, msg2 are their msg="..." attributes.
# "myaibhsigh" is a placeholder; comhad substitutes the real highlighting
# command for it.  The hold space is used to work on two copies of the line
# at once (one for the text, one for the messages).
comheadan()
{
sed -n '
/<E/ {
h
s/^<line uimhir=\"\([1-9][0-9]*\)\">/echo "\1:/
s/<[^>]*>//g
s/\(["$`]\)/\\\1/g
s/^echo \\"/echo "/
s/$/" | myaibhsigh"(/
G
x
s/<[^E][^>]*>//g
s/^[^<]*</</
s/>[^<]*</></g
s/>[^<]*$/>/
s/<E[^>]*msg=\"\([^"]*\)\"[^>]*>/"\1," /g
s/^/echo /
s/,\" $/."; echo/
x
s/<[^E\/][^>]*>//g
s/<\/[^E][^>]*>//g
s/\n[^<]*</</
s/>[^<]*$/>/
s/<\/E>[^<]*<E/<\/E><E/g
s/<E[^>]*>\([^<]*\)<\/E>/\1|/g
s/<[^>]*>//g
s/|$/)"/
G
p
}
'
}

# Decides whether a file argument is unusable ("droch-chomhad" = bad file).
# $1 - path given on the command line
# Returns 0 (true) when the path does not exist, is a directory or is not
# readable, after printing the matching message on standard error;
# returns 1 (false), silently, when the file can be processed.
drochchomhad()
{
	local target="${1}"
	local reason

	if [ ! -e "${target}" ]; then
		reason="${notfound}"
	elif [ -d "${target}" ]; then
		reason="${isdir}"
	elif [ ! -r "${target}" ]; then
		reason="${permissions}"
	else
		return 1
	fi

	echo "${CLAR}: ${target}: ${reason}" >&2
	return 0
}

# First argument handed to cuardach; --iomlan switches it to "noignore".
USEIGNORE="ignore"

# Front half of every checking pipeline: convert the input to the native
# encoding, split it into tagged sentences, run the cuardach binary over it
# (with the caller's original LC_CTYPE restored for that one program), and
# replace ISO-8859-1 by the native encoding name on lines that start with
# "<", any one character, then "xml".
unchecked_xml()
{
	diutf \
	| abairti \
	| LC_CTYPE="${OLDCTYPE}" /usr/lib/gramadoir/cuardach "${USEIGNORE}" "${CLAR}" "${TEANGA}" \
	| sed "/^<.xml/s/ISO-8859-1/${NATIVE}/"
}

# The standard checking chain: dictionary markup, then comhshuite, two
# passes of aonchiall, unigram, rialacha and finally eisceacht.
# Reads text on standard input and writes the tagged XML to standard output.
vanilla_xml_output()
{
	unchecked_xml \
	| comhshuite \
	| aonchiall \
	| aonchiall \
	| unigram \
	| rialacha \
	| eisceacht
}

# Shortened chain that stops before the unigram step: dictionary markup,
# comhshuite and two passes of aonchiall only.
# Reads text on standard input and writes the tagged XML to standard output.
nounigram_output()
{
	unchecked_xml \
	| comhshuite \
	| aonchiall \
	| aonchiall
}

# Handler for --ilchiall: reports the ambiguities that are still unresolved
# after nounigram_output, most frequent first.
#
# For every distinct tag set found inside <B><Z>...</Z> the report prints
# one line "count tags", followed by up to 15 indented lines giving the
# words that carry exactly that tag set, again with counts and sorted by
# frequency.
#
# The tagged XML is kept in a scratch file so it can be searched again for
# each tag set.  The file comes from mktemp; if that fails the function
# gives up (status 1) instead of using a fixed, guessable name in /tmp.
ambiguity_report()
{
	local tempxml ambig justtags

	tempxml=$(mktemp) || return 1

	nounigram_output |
	tee "${tempxml}" |
	sed 's/<B>/\
<B>/g' |
	grep -E '^<B>' |
	sed 's/^<B><Z>\([^Z]*\)<\/Z>.*/\1/' |
	sort | uniq -c | sort -r -n |
	while read -r ambig
	do
		echo "${ambig}"
		# drop the leading count, keeping only the tag set
		justtags=$(echo "${ambig}" | sed 's/^[^<]*//')
		grep -E -e "${justtags}" "${tempxml}" | sed "s@<B><Z>${justtags}@\\\\&@g" | sed 's/\\<B>/\
<B>/g' | grep -E -e "^<B><Z>${justtags}</Z>" | sed "s@^<B><Z>${justtags}<\/Z>\([^<]*\)<\/B>.*@\1@" | sort | uniq -c | sort -r -n | head -n 15 | sed "s/^/       /"
	done

	rm -f "${tempxml}"
}

# Handler for --minic: lists every opening tag whose name starts with
# A, C, D or F-W (and that contains no slash) found in the output of
# nounigram_output | unigram, one per line, most frequent first, with the
# counts stripped off again.
create_unigram_file()
{
	nounigram_output \
	| unigram \
	| grep -o "<[ACDF-W][^>/]*>" \
	| sort \
	| uniq -c \
	| sort -r -n \
	| sed 's/^[^<]*//'
}

# This is the main sequence of filters for checking a given file
# (standard input to standard output):
#   1. the function named in XML_OUTPUT produces the tagged XML;
#   2. comheadan turns every line with errors into shell code;
#   3. gawk replaces the first "myaibhsigh" placeholder on each line with
#      the highlighting command held in AIBHSIGHVAR;
#   4. toutf converts back to the user's encoding;
#   5. loganu swaps the symbolic error names for the translated messages;
#   6. the resulting script is executed by /bin/bash reading it from stdin.
# NOTE(review): the text under check ends up inside the generated script, so
# the quoting done in comheadan is what keeps step 6 safe.
comhad()
{
		"${XML_OUTPUT}" |
		comheadan |
		gawk "{sub(/myaibhsigh/,\"${AIBHSIGHVAR}\"); print}" |
		toutf |
		loganu |
		/bin/bash -s
}

# Extracts the contents of the <X>...</X> elements from the tagged XML on
# standard input and prints them one per line.
# Only lines containing <X> are considered.  On those, every tag other than
# <X> and </X> is deleted, the text before the first <X> and after the last
# </X> is dropped, and whatever lies between one </X> and the next <X> is
# replaced by a newline.
litriu_comheadan()
{
sed -n '
/<X>/{
s/<[^X\/][^>]*>//g
s/<\/[^X][^>]*>//g
s/^[^<]*<X>//
s/<\/X>[^<]*$//
s/<\/X>[^<]*<X>/\
/g
p
}
'
}

# Takes the place of "comhad" when the "--litriu" option is given: runs
# only the dictionary markup, pulls out the <X> elements with
# litriu_comheadan and converts the result back to the user's encoding.
litriu()
{
	unchecked_xml \
	| litriu_comheadan \
	| toutf
}

# used by Brill
#
# Reads one line of the form "COUNT1 FREQ1 TAG1@COUNT2 ..." from standard
# input (the two best candidates, joined with "@") and prints
# "SCORE TAG1", where SCORE = (COUNT1 - COUNT2) * FREQ1 as computed by dc.
#
# The input is always expanded inside double quotes so that the tag text is
# passed on verbatim: no word splitting, no collapsing of whitespace and no
# pathname expansion of characters such as "*" or "?".
process_two_best()
{
	local whole count1 freq1 tag1 count2 answer

	whole=$(cat)
	count1=$(printf '%s\n' "${whole}" | sed 's/^\([0-9][0-9]*\).*/\1/')
	freq1=$(printf '%s\n' "${whole}" | sed 's/^[0-9][0-9]* \([0-9][0-9]*\).*/\1/')
	tag1=$(printf '%s\n' "${whole}" | sed 's/^[0-9][0-9]* [0-9][0-9]* \(<[^>]*>\).*/\1/')
	# first number after the "@" separator, i.e. the runner-up's count
	count2=$(printf '%s\n' "${whole}" | sed 's/^[^@]*@\([0-9][0-9]*\).*/\1/')
	answer=$(echo "${count1} ${count2} - ${freq1} * p" | dc)
	echo "${answer} ${tag1}"
}

# Computes scores for Brill algorithm
# Takes one argument (the temp file containing ambigs in context)
#
# Standard input: the unambiguous tags seen in the context under study, one
# per line.  For each distinct tag a line "PPM FREQ TAG" is written to a
# scratch file, where FREQ is looked up in ./tagfreqs.txt and
# PPM = occurrences-in-context * 1000000 / FREQ (integer arithmetic by dc).
# Two "0 0 <NOTHING>" lines are always included so that every ambiguity has
# at least two candidates.
#
# Then, for each ambiguity listed in the file named by $1, the two
# best-scoring candidate tags are handed to process_two_best and one line
#   SCORE <B><Z>ambiguity</Z>ANYTHING</B>:TAG
# is printed on standard output.
#
# The scratch file comes from mktemp rather than a fixed name in /tmp, so
# concurrent runs cannot collide and the path cannot be pre-created by
# another user.  Returns 1 if mktemp fails.
best_tag_in_context()
{
	local unambigfile line tag incontext freq ppmincontext ambiguity individual

	unambigfile=$(mktemp) || return 1

	(echo "0 0 <NOTHING>"; echo "0 0 <NOTHING>";
	sort | uniq -c |
	while read -r line
	do
		tag=$(echo "${line}" | sed 's/^ *[1-9][0-9]*.//')
		incontext=$(echo "${line}" | sed 's/^ *\([1-9][0-9]*\).*/\1/')
		freq=$(grep -E "${tag}" ./tagfreqs.txt | sed 's/^ *\([1-9][0-9]*\).*/\1/')
		ppmincontext=$(echo "${incontext} 1000000 * ${freq} / p" | dc)
		echo "${ppmincontext} ${freq} ${tag}"
	done) > "${unambigfile}"

	sed 's/^<Z>//' "${1}" | sed 's/<\/Z>//' |
	while read -r ambiguity
	do
		echo "${ambiguity}<NOTHING>" | grep -o '<[A-Z][^>]*>' | sed 's/\/>/>/' |
		while read -r individual
		do
			grep -E -e "${individual}" "${unambigfile}"
		done | sort -r -n | head -n 2 | tr "\n" "@" | process_two_best |
		sed "s%^\([0-9]*\) \(.*\)%\1 <B><Z>${ambiguity}</Z>ANYTHING</B>:\2%"
	done

	rm -f "${unambigfile}"
}

# 1st arg=contextual marker (chunk of XML amounting to a single marked up
#   word), written as a basic regular expression and assumed to contain
#   exactly one set of escaped parentheses!
# 2nd arg, like 1st arg, but used in output to putative "aonchiall-xx.in"
# 3rd arg=(subset of) XML file
#  does both the "before" and "after" contexts
#
# For each of the two contexts the ambiguous tag sets seen next to the
# marker are collected into a scratch file, the unambiguous tags seen in the
# same position are piped into best_tag_in_context, and the second argument
# is spliced into the resulting lines.
#
# Changes from the earlier version:
#  - the scratch file comes from mktemp instead of the fixed /tmp/ambigs;
#  - the "after" search used "( )+" in a grep call without -E, where
#    parentheses and "+" are literal characters, so it never matched real
#    markup.  It now uses the basic-regex form \( \)\{1,\}.
compute_given_context()
{
	local ambigfile

	ambigfile=$(mktemp) || return 1

	# "before" context: the marker directly precedes the word in question
	grep -o "${1} <B><Z>\(<[A-Z][^>]*/>\)*</Z>" "${3}" |
	sed "s@^${1} <B>\(<Z>\(<[A-Z][^>]*>\)*</Z>\)@\2@" |
	sort -u > "${ambigfile}"
	grep -o "${1} <[ACDF-W][^>]*>" "${3}" |
	sed "s@^${1} \(<[ACDF-W][^>]*>\)@\2@" |
	best_tag_in_context "${ambigfile}" |
	sed "s@^\([0-9]*\) @\1 ${2} @"

	# "after" context: the marker directly follows the word in question
	grep -o "<B><Z>\(<[A-Z][^>]*/>\)\{1,\}</Z>[^<]*<\/B> ${1}" "${3}" |
	sed "s@^<B>\(<Z>\(<[A-Z][^>]*>\)*</Z>\).*@\1@" |
	sort -u > "${ambigfile}"
	grep -o "<[ACDF-W][^>]*>[^<]*<\/[A-Z]> ${1}" "${3}" |
	sed "s@^\(<[ACDF-W][^>]*>\).*@\1@" |
	best_tag_in_context "${ambigfile}" |
	sed "s@:<[^>]*>@ ${2}&@"

	rm -f "${ambigfile}"
}

# Handler for --brill: proposes disambiguation rules from the text on
# standard input and prints them best-scoring first (candidates with score
# 0 are dropped and the score column is removed).
#
# Side effects in the current directory: the tagged XML is saved as
# ./brill.xml and the tag frequencies as ./tagfreqs.txt (the latter is read
# back by best_tag_in_context).
#
# Contexts are tried (a) for every unambiguous tag that occurs and (b) for
# the 50 most frequent words.  For each one the matching lines of brill.xml
# are copied into a scratch file which is then given to
# compute_given_context.  That scratch file comes from mktemp instead of the
# fixed /tmp/tobrillsubset.xml, so concurrent runs cannot collide and the
# path cannot be pre-created by another user.  Returns 1 if mktemp fails.
#
# NOTE: the two compute_given_context calls deliberately keep their
# backtick substitutions; the number of backslashes in the sed expressions
# depends on backtick-style escape processing.
brill_unsupervised_learning()
{
	local BIGXML BIGXMLSUBSET tag commonword

	BIGXML="./brill.xml"
	BIGXMLSUBSET=$(mktemp) || return 1

	(nounigram_output |
	tee "${BIGXML}" |
	grep -E -o "<[ACDF-W][^>/]*>" |
	sort | uniq -c | tee ./tagfreqs.txt |
	sed 's/^ *[0-9]*.//' |
	while read -r tag
	do
		grep -E "${tag}" "${BIGXML}" > "${BIGXMLSUBSET}"
		compute_given_context "`echo "${tag}" | sed 's/<\([A-Z]\)[^>]*>/&[^<]*<\/\\\(\1\\\)>/'`" "`echo "${tag}" | sed 's/<\([A-Z]\)[^>]*>/&ANYTHING<\/\1>/'`" "${BIGXMLSUBSET}"
	done;
	grep -E -o ">[^< ]+<" "${BIGXML}" |
	sort | uniq -c | sort -r -n |
	sed 's/^ *[0-9]*.//' |
	head -n 50 |
	while read -r commonword
	do
		grep -E "${commonword}" "${BIGXML}" > "${BIGXMLSUBSET}"
		compute_given_context "`echo "${commonword}" | sed 's/>\(.*\)/\\\\(<[^>]*>\\\\)*\1\/[A-Z]>/'`" "`echo "${commonword}" | sed 's/>\([^<]*\)</\1/'`" "${BIGXMLSUBSET}"
	done) | sort -r -n | grep -E -v '^0 ' | sed 's/^[0-9]* //'

	rm -f "${BIGXMLSUBSET}"
}

# remainder is the "main()"

# Defaults; the option loop below may override them.
#   AIBHSIGHVAR  command that highlights the errors in the final output
#   ACTION       function run on each input (default: full check)
#   SCRIOBH_HTML "issea" once --html has been seen, "nihea" otherwise
#   TEANGA       language code of the text
#   IONCHOD      encoding of the text ("unspecified" = use the native one)
#   XML_OUTPUT   function that produces the tagged XML
AIBHSIGHVAR="grep -E --color -h "
ACTION="comhad"
SCRIOBH_HTML="nihea"
TEANGA="ga"
IONCHOD="unspecified"
XML_OUTPUT="vanilla_xml_output"

# Succeeds only when $1 is exactly one of the space-separated codes listed
# in TEANGACHA.  Empty values, values containing whitespace and mere
# substrings of a code (e.g. "g") are rejected.
_supported_language()
{
	case "${1}" in
	"" | *[[:space:]]* )
		return 1
	;;
	esac
	case " ${TEANGACHA} " in
	*" ${1} "* )
		return 0
	;;
	esac
	return 1
}

# Consume the leading arguments that begin with a dash; everything from the
# first other argument on is treated as a file name.
while [ -n "${1%%[^-]*}" ]
do
	case "${1}" in
	*=* )
		# --option=value form: split at the first "="
		JUSTOPT="${1%%=*}"
		OPTARG="${1#*=}"
		case "${JUSTOPT}" in
		"--teanga" )
			if _supported_language "${OPTARG}"
			then
				TEANGA="${OPTARG}"
			else
				printf "${CLAR}: ${unsupported}\n" "${OPTARG}" >&2
				exit 1
			fi
		;;
		"--ionchod" )
			IONCHOD="${OPTARG}"
		;;
		"--help" | "--version" | "--brill" | "--html" | "--ilchiall" | "--minic" | "--no-unigram" | "--xml" | "--litriu" | "--aspell" | "--iomlan" )
			unwanted_arg "${JUSTOPT}" >&2
			exit 1
		;;
		* )
			option_error "${JUSTOPT}" >&2
			exit 1
		;;
		esac
	;;
	* )
		case "${1}" in
		"--version" )
			# --html must come first on the command line to get HTML here
			if [ "${SCRIOBH_HTML}" = "issea" ]
			then
				htmlversion
			else
				versionout
			fi
			exit 0
		;;
		"--help" )
			helpout
			exit 0
		;;
		"--html" )
			AIBHSIGHVAR="gawk -f /usr/lib/gramadoir/hilite.awk pattern="
			SCRIOBH_HTML="issea"
		;;
		"--litriu" )
			ACTION="litriu"
		;;
		"--aspell" )
			XML_OUTPUT="aspell_xml_output"
		;;
		"--xml" )
			ACTION="xml_output"
		;;
		"--brill" )
			ACTION="brill_unsupervised_learning"
		;;
		"--ilchiall" )
			ACTION="ambiguity_report"
		;;
		"--minic" )
			ACTION="create_unigram_file"
		;;
		"--no-unigram" )
			ACTION="nounigram_output"
		;;
		"--iomlan" )
			USEIGNORE="noignore"
		;;
		"--ionchod" | "--teanga" )
			no_argument "${1}" >&2
			exit 1
		;;
		* )
			option_error "${1}" >&2
			exit 1
		;;
		esac
	;;
	esac
	shift
done

# Look up the chosen language in the TEANGACHA data file.  The fields of
# the matching line are separated by spaces: language code, native
# encoding, then the two character specifications stored in BDCHARS and
# INTCHARS.
NATIVE=$(grep "^${TEANGA} " "${DATADIR}/TEANGACHA" | sed "s/^${TEANGA}  *\([^ ]*\).*/\1/")
CHARSPEC=$(grep "^${TEANGA} " "${DATADIR}/TEANGACHA" | sed "s/^${TEANGA}  *[^ ]*  *//")
BDCHARS=$(echo "${CHARSPEC}" | sed "s/ .*//")
INTCHARS=$(echo "${CHARSPEC}" | sed "s/^[^ ]*  *//")

# Without --ionchod the text is taken to be in the native encoding.
if [ "${IONCHOD}" = "unspecified" ]; then
	IONCHOD="${NATIVE}"
fi

# A foreign encoding is only usable if iconv can convert from it.
if [ "${IONCHOD}" != "${NATIVE}" ] &&
	! echo | iconv -f "${IONCHOD}" -t "${NATIVE}" > /dev/null 2>&1
then
	printf "${CLAR}: ${iconvproblem}\n" "${IONCHOD}" >&2
	exit 1
fi

# --aspell quietly degrades to the plain chain (after a warning) when
# aspell cannot be run with a dictionary for this language.
if [ "${XML_OUTPUT}" = "aspell_xml_output" ] &&
	! echo "hellooo" | aspell -a -d "${TEANGA}" > /dev/null 2>&1
then
	printf "${CLAR}: ${aspellproblem}\n" "${TEANGA}" >&2
	XML_OUTPUT="vanilla_xml_output"
fi

# --xml simply means "run the XML producer itself as the action".
if [ "${ACTION}" = "xml_output" ]; then
	ACTION="${XML_OUTPUT}"
fi

# Run the selected action: on standard input when no file arguments are
# left, otherwise once per file.  Files that drochchomhad rejects are
# skipped (it has already printed the reason).  The "Currently checking"
# banner is printed only for the default full check.
#
# The decision is based on the argument count, and the loop visits every
# argument, so an empty-string argument is reported as a missing file
# rather than switching to standard input or silently ending the loop.
if [ "$#" -eq 0 ]
then
	"${ACTION}"
else
	for comhadainm in "$@"
	do
		if ! drochchomhad "${comhadainm}"
		then
			if [ "${ACTION}" = "comhad" ]
			then
				printf "${checking}\n" "${comhadainm}"
			fi
			"${ACTION}" < "${comhadainm}"
		fi
	done
fi
exit 0
