#! /usr/bin/env python3

##############################################
# Extract symbol names from a .ind file.     #
# Author: Scott Pakin <scott-clsl@pakin.org> #
##############################################

import re
import sys
from collections import defaultdict


# Define regular expressions to pick out a symbol name (as invoked from a
# document) and implementation (which may be specific to the CLSL).
#
# name_re captures the text that follows "\item " or "\subitem ", up to (but
# not including) the whitespace that precedes an opening parenthesis.
#
# impl_re captures the parenthesized text that immediately precedes "\pfill".
# An optional "$" just inside each parenthesis is excluded from the capture,
# and the captured text itself may contain neither ")" nor "$".
name_re = re.compile(r'\\(?:sub)?item (.+?)\s+\(')
impl_re = re.compile(r'\s+\(\$?([^\)$]+)\$?\)\\pfill')

# Define a regular expression to replace \verb+ANYTHING+ or \spverb+ANYTHING+
# with ANYTHING.  Only "+" is recognized as the delimiter.
verb_re = re.compile(r'\\(?:sp)?verb\+([^+]+)\+')

# Define a regular expression to replace \blackacc{ANYTHING} (and the similar
# \blackacchack{ANYTHING} and \blackacctwo{ANYTHING} constructs) with
# ANYTHING.
# NOTE(review): this pattern appears to be unused in the visible part of the
# file; strip_wrappers() handles these macros by string matching instead.
blackacc_re = re.compile(r'\\blackacc(?:hack|two)?\{([^\}]+)\}')


def strip_wrappers(s):
    '''Remove uninteresting wrapper calls around an implementation string.

    Each wrapper is a (prefix, suffix) pair.  Whenever the string both
    starts with the prefix and ends with the suffix, the two are removed
    and the remainder is stripped of surrounding whitespace.  This repeats
    until no wrapper matches, so nested wrappers are peeled off layer by
    layer.  Returns the unwrapped string (possibly unchanged).'''
    # Handle both "\blackacc{\xyz}" and "\blackacc\xyz" formulations.
    # Remove "\ensuremath{...}" and "\mbox{...}".  Remove various
    # special-case expressions that appear in the index.
    #
    # Order matters within a pass: "\blackacchack" and "\blackacctwo" must
    # be tried before "\blackacc", which is a prefix of both, and the
    # braced forms must be tried before the unbraced ones.
    wrappers = (
        (r'\blackacchack{', '}'),
        (r'\blackacctwo{',  '}'),
        (r'\blackacc{',     '}'),
        (r'\blackacchack',  ''),
        (r'\blackacctwo',   ''),
        (r'\blackacc',      ''),
        (r'\ensuremath{',   '}'),
        # The prefix must include the opening brace; otherwise
        # "\mbox{\xyz}" would be reduced to "{\xyz", leaving a stray "{".
        (r'\mbox{',         '}'),
        (r'\strut',         ''),
        (r'\smash',         ''),
        (r'',               r'\relax'),
        (r'',               r'\hspace{0.5em}'),
    )
    changed = True
    while changed:
        changed = False
        for w0, w1 in wrappers:
            if s.startswith(w0) and s.endswith(w1):
                # Drop the prefix and suffix, then any whitespace exposed
                # by their removal.
                s = s[len(w0):len(s) - len(w1)].strip()
                changed = True
    return s


def extract_name(s):
    '''Return the control sequence (with optional arguments) found in an
    index-entry string, or None if the string does not hold one.'''
    # Hyperlinked entries are never symbol names.
    if s.startswith(r'\href{'):
        return None

    # Unwrap verbatim text, then discard discretionary hyphens and
    # optional line breaks, which are typesetting hints only.
    cleaned = verb_re.sub(r'\1', s)
    for hint in (r'\-', r'\linebreak[0]'):
        cleaned = cleaned.replace(hint, '')
    cleaned = cleaned.strip()

    # Accept anything that begins with a backslash as well as any
    # single-character string.
    is_control_sequence = cleaned.startswith('\\')
    if is_control_sequence or len(cleaned) == 1:
        return cleaned
    return None


def contains_stray_spaces(s):
    '''Report whether a string has a space at curly-brace nesting level
    zero.  Returns True as soon as such a space is seen, False if the
    whole string is scanned without finding one.'''
    nesting = 0
    for ch in s:
        if ch == '{':
            nesting += 1
        elif ch == '}':
            nesting -= 1
        elif ch == ' ' and nesting == 0:
            # A space with no brace group open around it.
            return True
    return False


def clean_dict(i2n):
    '''Build a tidied copy of an implementation-to-name mapping.  A name
    of None is replaced with the implementation itself, and entries in
    which neither the implementation nor the name contains a backslash
    are dropped.'''
    texttt_prefix = r'\\texttt{'
    tidy = {}
    for impl, name in i2n.items():
        shown = impl if name is None else name
        if not ('\\' in impl or '\\' in shown):
            # Neither side is a control sequence.
            continue
        if shown.startswith(texttt_prefix) and shown.endswith('}'):
            # Special case for "|": keep only what the wrapper encloses.
            shown = shown[len(texttt_prefix):-1]
        tidy[impl] = shown
    return tidy


def impl_sort_key(k):
    '''Return the sort key for implementation k in impl_to_name: order by
    name first (case-insensitively, then exactly) and break ties by
    implementation in the same manner.'''
    # Fall back to the implementation when the name is None or empty.
    name = impl_to_name[k]
    if not name:
        name = k
    return (name.lower(), name, k.lower(), k)


###########################################################################

# Require the name of the .ind file on the command line rather than failing
# with an IndexError.
if len(sys.argv) < 2:
    sys.exit('Usage: %s <file.ind>' % sys.argv[0])

# Process the input file line-by-line, building a mapping from each symbol
# implementation to the name used to invoke it (or None if unknown).
impl_to_name = defaultdict(lambda: None)
with open(sys.argv[1]) as r:
    for ln in r:
        # Find the symbol implementation.  Continue with the next line if
        # no implementation is found or the implementation contains a space
        # (as in, e.g., "xyz package option").
        match = impl_re.search(ln)
        if match is None:
            continue
        impl = strip_wrappers(match[1])
        if r'\,' in impl or contains_stray_spaces(impl):
            continue

        # Find the symbol name.  Use None if there is no name or the name
        # is not a control sequence.
        name = None
        match = name_re.search(ln)
        if match is not None:
            name = extract_name(match[1])

        # Store the mapping.  Don't replace non-None with None, though.
        if impl_to_name[impl] is None:
            impl_to_name[impl] = name


# Sort and output the list of symbols, one "NAME IMPLEMENTATION" pair per
# line with the names padded to a common width.
impl_to_name = clean_dict(impl_to_name)
# default=0 keeps max() from raising ValueError when no symbols were found.
max_name_len = max((len(v) for v in impl_to_name.values()), default=0)
impls = sorted(impl_to_name, key=impl_sort_key)
for k in impls:
    print('%-*.*s %s' % (max_name_len, max_name_len, impl_to_name[k], k))
