package edu.vt.marian.Document;

import java.io.*;
import java.net.*;
import java.util.*;

import edu.vt.marian.common.*;


/**
 *  A two-way mapping between the characters of a character-based encoding
 *  (e.g., ANSEL) and the entity references of an entity-based encoding
 *  (e.g., OAI XML).
 *
 *  <P>The mapping is held in two tables:  one from characters to entities
 *  (or to a {@link DiacriticMap} for combining diacritics) and one from
 *  entities back to character sequences.  Both are filled by the constructor
 *  (for '&amp;' and '&lt;') and by {@link #load(BufferedReader)}.
 *
 *  <P><B>Limitation:</B>  the string conversions handle a single combining
 *  diacritic per modified character.  In mapStringToEntities() a diacritic
 *  consumes exactly the one character after it; in mapStringFromEntities()
 *  an entity that immediately follows another entity is written in place.
 *
 *  @author	Robert France
 */
public class EntityMap
{
    /**
     *  A mapping for a combining diacritic character, including mappings where
     *      defined from [diacrit+modified character] to single combined entity.
     *
     *  <B>NOTE:</B>  In XML combining diacritics follow the modified character;
     *      in MARC/ANSEL they precede it.  Thus when moving from XML to ANSEL
     *      some lookahead is necessary.
     */
    private static final class DiacriticMap
    {
        /** Only ASCII characters (0..127) may have a combined entity. */
        private static final int ASCII_SIZE = 128;

        /**
         *  Ready-to-write combined entities ("&amp;name;"), indexed by the
         *      ASCII value of the modified character; null where no combined
         *      entity is defined.
         */
        final String [] entityStr;

        /** Ready-to-write entity ("&amp;name;") for the bare diacritic. */
        final String defaultEntity;

        /**
         *  @param	defEnt	The undelineated entity name of the bare diacritic.
         */
        public DiacriticMap(String defEnt)
        {
            defaultEntity = "&" + defEnt + ";";
            // Entity strings for modified ASCII characters, where such defined.
            // A fresh array is all null:  begin with all undefined.
            entityStr = new String [ASCII_SIZE];
        }

        /**
         *  Define the combined entity for this diacritic applied to a
         *      character.
         *
         *  @param	charValue	The value of the modified character.
         *  @param	str		The undelineated combined entity name.
         *  @return	false (and nothing is stored) when charValue is outside
         *		the ASCII range that this map can hold; true otherwise.
         */
        public boolean addMap(int charValue, String str)
        {
            if ( (charValue < 0) || (charValue >= ASCII_SIZE) )
                return( false );
            entityStr[charValue] = "&" + str + ";";
            return( true );
        }

        /**
         *  Is a combined entity defined for this diacritic applied to c?
         */
        public boolean hasComposite(char c)
        {
            return( (c < ASCII_SIZE) && (entityStr[c] != null) );
        }

        /**
         *  Write the entity form of this diacritic applied to c:  the combined
         *      entity where one is defined, otherwise c itself followed by the
         *      entity of the bare diacritic.
         *
         *  @return	true if a combined entity was written, false otherwise.
         */
        public boolean map(char c, BufferedWriter out) throws IOException
        {
            if ( ! hasComposite(c) )	// Non-ASCII (such craziness does
            {				//  happen) or no combined character.
                out.write(c);
                out.write(defaultEntity);
                return( false );
            }
            out.write(entityStr[c]);
            return( true );
        }

        public int hashCode()
        {
            return( defaultEntity.hashCode() );
        }
    }


    /** Sink for diagnostic messages about malformed input. */
    Debug debug;

    /**
     *  Any entity longer than this has got to be a mistake.
     */
    private static final int MAX_ENTITY_LENGTH = 30;

    /**
    Mapping from non-ASCII characters to either entity strings or diacritic
        maps.
    When the data in charTable is a String, it is "ready to wear" with ampersand
        and semicolon already in place.  See note to addEntity().
    */
    private Hashtable charTable;

    /**
    Mapping from entity strings to non-ASCII character (sequences).
    Keys are as produced by resolveEntityStr():  a String for a symbolic entity
        name, an Integer for a numeric reference.  Values are always Strings.
    */
    private Hashtable entityTable;


    /**
     *  Find the Object corresponding to this entity string.  If this is a
     *      symbolic entity name, use the String form.  If on the other hand
     *      it is a numeric entity reference (presumably to a UNICODE character)
     *      convert it to an Integer.
     *
     *  An empty string, or one that begins with '#' but is not a well-formed
     *      number, is returned unchanged as a String.
     *
     *  @param	entityStr	The undelineated entity:  i.e., the sequence
     *        		of characters strictly between the '&amp;' and ';' characters.
     */
    private Object resolveEntityStr(String entityStr)
    {
        // Guard the charAt() calls:  "&;" yields an empty entity string.
        if ( (entityStr.length() == 0) || (entityStr.charAt(0) != '#') )
            return( entityStr );

        // Numeric entity reference.  Allow both hex and decimal values.
        try
        {
            if ( (entityStr.length() > 1) && (entityStr.charAt(1) == 'x') )
                return( new Integer(Integer.parseInt(entityStr.substring(2), 16)) );
            return( new Integer(Integer.parseInt(entityStr.substring(1))) );
        }
        catch (NumberFormatException e)
        {
            debug.dumpTrace("EntityMap.resolveEntityString:  entity '" + entityStr +
                            "' begins with # but is not a well-formed number.  " +
                            "Treating as string.");
            return( entityStr );
        }
    }


    /**
     *  Add a simple two-way mapping between a single (non-ASCII) character and
     *      an entity string to the map function.
     *
     *  <B>NOTE:</B>  For numeric entity references, we take the representation
     *      of the numeric value that is being added here to be canonical.  This
     *      means that in a round-trip translation XML->ANSEL->XML, for
     *      instance, an entity reference like <CODE>&amp;#233;</CODE> or
     *      <CODE>&amp;#xe9;</CODE> will end up as <CODE>&amp;#x00E9;</CODE> if
     *      the latter is the way that the reference occurs in the loading file
     *      (which is the case with "ansdel_uni_comb.map").
     *      What this gains us is the confidence to assume that the objects
     *      in charTable can always be assumed to be Strings.
     *
     *  @param	charVal		The character.
     *  @param	entityStr	The undelineated entity (no '&amp;' or ';').
     */
    private void addEntity(char charVal, String entityStr)
    {
        charTable.put(new Character(charVal), "&" + entityStr + ";");
        entityTable.put(resolveEntityStr(entityStr), String.valueOf(charVal));
    }


    /**
     *  Is this a combining diacritic, rather than either an unmodified or a
     *      composite character?
     *
     *  @param	inverseMapping	A value obtained from entityTable.
     */
    private boolean isDiacritic(Object inverseMapping)
    {
        if (! (inverseMapping instanceof String) ||	    // Sanity clause.
              ( ((String) inverseMapping).length() != 1) )
            return( false );

        char c = ((String) inverseMapping).charAt(0);
        // instanceof is false for null, i.e. for an unmapped character.
        return( charTable.get(new Character(c)) instanceof DiacriticMap );
    }


    /**
     *  Look up the character sequence for the entity that starts at offset.
     *
     *  @param	baseStr	the string containing the entity.
     *  @param	offset	the offset in baseStr of the leading '&amp;' of the entity.
     *  @return	the entityTable value for the entity, or null (after logging)
     *		if there is no terminating ';' or the entity is unknown.
     */
    private Object findEntityMapping(String baseStr, int offset)
    {
        int j = baseStr.indexOf(';', offset+1);
        if (j == -1)	// No terminating semicolon!
        {
            debug.dumpTrace("EntityMap.findEntityMapping(): no terminating ';' for '&' at offset '" +
                            offset + " in '" + baseStr + "': writing as '&'.");
            return( null );
        }
        Object entity = resolveEntityStr( baseStr.substring(offset+1, j) );

        Object mapping = entityTable.get(entity);
        if ( mapping == null )
        {
            debug.dumpTrace("EntityMap.findEntityMapping():  unknown entity '" +
                            baseStr.substring(offset+1, j) + "': writing as read.");
        }
        return( mapping );
    }


    /**
     *  Create a map that knows only the '&amp;' and '&lt;' characters.
     *
     *  @param	dbg	Where to report malformed input.
     */
    public EntityMap(Debug dbg)
    {
        debug = dbg;
        charTable = new Hashtable(256);	// The absolute maximum size for 8-bit characters.
        entityTable = new Hashtable(512);	// Just a good guess.
        addEntity('&', "#x0026");	// Two values that are needed in any SGML, HTML
        addEntity('<', "#x003C");	//  or XML DTD, and that seem to be pretty standard.
    }


    /**
     *  Load from a file.
     *
     *  <B>FORMAT:</B>  One definition per line, tokens separated by white
     *      space.
     *  <UL>
     *  <LI>A line beginning with '#' is a comment.  Empty lines are skipped.
     *  <LI><CODE>value e entity</CODE> maps the character <I>value</I> to
     *      and from <CODE>&amp;entity;</CODE>.
     *  <LI><CODE>value d entity</CODE> declares <I>value</I> to be a
     *      combining diacritic whose bare form is <CODE>&amp;entity;</CODE>.
     *      The lines immediately following that begin with a tab, each of
     *      the form <CODE>value entity</CODE>, give the combined entity for
     *      the diacritic applied to the ASCII character <I>value</I>.
     *  </UL>
     *  Character values are decimal, or hexadecimal when prefixed by "x" or
     *      "0x".  Lines that cannot be understood are reported through the
     *      debug object and ignored.
     */
    public void load(BufferedReader in) throws IOException
    {
        String line = in.readLine();
        if ( line == null )
        {
            debug.dumpTrace("EntityMap.load():  empty file?!?!.");
            return;
        }

        while ( line != null )
        {
            if ( (line.length() == 0) || (line.charAt(0) == '#') )
            {
                line = in.readLine();	// Blank or comment:  ignore.
            }
            else if ( line.charAt(0) == '\t' )
            {
                debug.dumpTrace("EntityMap.load():  indented line '" +
                              line + "' not part of diacritic set: ignoring.");
                line = in.readLine();
            }
            else
                line = loadDefinition(line, in);
        }
    }


    /**
     *  Process one unindented definition line, together with any indented
     *      lines that belong to it.
     *
     *  @return	the next line still to be processed, or null at end of file.
     */
    private String loadDefinition(String line, BufferedReader in) throws IOException
    {
        StringTokenizer st = new StringTokenizer(line);
        int charVal = 0;
        boolean understood = (st.countTokens() >= 3);
        if ( understood )
        {
            try
            {
                charVal = parseCharValue(st.nextToken());
            }
            catch( NumberFormatException e )
            {
                understood = false;
            }
        }
        if ( ! understood )
        {
            debug.dumpTrace("EntityMap.load():  cannot understand '" +
                     line + "': ignoring.");
            return( in.readLine() );
        }

        String type = st.nextToken();
        String token = st.nextToken();
        if ( type.equals("e") )
        {
            addEntity((char) charVal, token);
            return( in.readLine() );
        }
        if ( type.equals("d") )
            return( loadDiacritic((char) charVal, token, in) );

        debug.dumpTrace("EntityMap.load(): unknown type in '" +
              line + "': ignoring.");
        return( in.readLine() );	// Always advance, or load() never ends.
    }


    /**
     *  Register a combining diacritic and read the tab-indented lines that
     *      follow its declaration.  The bare diacritic goes into entityTable
     *      for the non-composite case; a DiacriticMap (including composites)
     *      goes into charTable.
     *
     *  @param	diacritic	The diacritic character.
     *  @param	entityName	The undelineated entity of the bare diacritic.
     *  @return	the first line that is not part of the diacritic set, or null
     *		at end of file.
     */
    private String loadDiacritic(char diacritic, String entityName, BufferedReader in)
        throws IOException
    {
        entityTable.put(resolveEntityStr(entityName), String.valueOf(diacritic));

        DiacriticMap dMap = new DiacriticMap(entityName);
        char [] charArray = new char [2];	// ANSEL order:  diacritic first.
        charArray[0] = diacritic;

        String line = in.readLine();
        while ( (line != null) && (line.length() > 0) && (line.charAt(0) == '\t') )
        {
            StringTokenizer st = new StringTokenizer(line);
            try
            {
                if ( st.countTokens() < 2 )
                    throw new NumberFormatException("too few fields");
                int modCharVal = parseCharValue(st.nextToken());
                String token = st.nextToken();

                if ( dMap.addMap(modCharVal, token) )
                {
                    charArray[1] = (char) modCharVal;
                    entityTable.put(resolveEntityStr(token),
                                    new String(charArray));
                }
                else
                    debug.dumpTrace("EntityMap.load():  modified character in '" +
                             line + "' is not ASCII: ignoring.");
            }
            catch( NumberFormatException e )
            {
                // Skip only the bad line; the rest of the set is still good.
                debug.dumpTrace("EntityMap.load():  cannot understand '" +
                         line + "': ignoring.");
            }
            line = in.readLine();
        }
        charTable.put(new Character(diacritic), dMap);
        return( line );
    }


    /**
     *  Parse a character value from the map file.  Allow both hex (prefixed
     *      by "x" or "0x") and decimal values.
     *
     *  @throws	NumberFormatException	if the token is not a number.
     */
    private static int parseCharValue(String token)
    {
        if ( token.startsWith("x") )
            return( Integer.parseInt(token.substring(1), 16) );
        if ( token.startsWith("0x") )
            return( Integer.parseInt(token.substring(2), 16) );
        return( Integer.parseInt(token) );
    }


    /**
     * Take a string in a character encoding (e.g., extended ASCII) and convert it to
     *    an entity-based encoding (e.g., XML entity references).
     *
     *  A combining diacritic is taken to modify the single character after
     *    it.  If a combined entity is defined for the pair, that is written;
     *    otherwise the modified character (as its own entity, if it has one)
     *    is written followed by the entity of the bare diacritic.  A diacritic
     *    at the very end of the string is written as its bare entity.
     *
     *  @param	str	The string to be converted.
     *  @param	out	A BufferedWriter into which to put the new string.
     */
    public void mapStringToEntities(String str, BufferedWriter out) throws IOException
    {
        for (int i=0; i<str.length(); i++)
        {
            char c = str.charAt(i);
            Object mapping = charTable.get(new Character(c));
            if ( mapping == null )
            {
                out.write(c);
            }
            else if ( mapping instanceof String )
            {
                out.write((String) mapping);
            }
            else if ( mapping instanceof DiacriticMap )
            {
                DiacriticMap diacritic = (DiacriticMap) mapping;
                if ( i+1 >= str.length() )
                {
                    out.write(diacritic.defaultEntity);
                    continue;
                }

                char base = str.charAt(++i);
                Object baseMapping = charTable.get(new Character(base));
                if ( (baseMapping instanceof String) && ! diacritic.hasComposite(base) )
                {
                    // The modified character must itself be escaped (e.g.
                    //  '&' or '<'):  never let it through raw.
                    out.write((String) baseMapping);
                    out.write(diacritic.defaultEntity);
                }
                else
                    diacritic.map(base, out);
            }
            else
                debug.dumpTrace("EntityMap.mapStringToEntities():  impossible error: unrecognized object in mapping table.");
        }
    }


    /**
     * Take a string in an entity-based encoding (e.g., XML entity references) and convert it to
     *    a character-based encoding (e.g., extended ASCII).
     *
     *  Unterminated and unknown entities are copied through unchanged.
     *
     *  @param	str	The string to be converted.
     *  @param	out	A BufferedWriter into which to put the new string.
     */
    public void mapStringFromEntities(String str, BufferedWriter out) throws IOException
    {
        Object mapping;
        for (int i=0; i<str.length(); i++)
        {
            char c = str.charAt(i);
            if ( c == '&' )	// Entity at beginning of string or immediately
            {			//  following another entity:  just write it.
                if ( (mapping = findEntityMapping(str, i)) == null )
                {
                    out.write( c );	// Unterminated or unknown entity: do
                }			//  what you can.
                else	// A known entity mapping.  It should never be a
                {	//  diacritic, but write it no matter what.
                    out.write( (String) mapping);
                    i = str.indexOf(';', i+1);	// Loop increment steps past ';'.
                }
            }
            else if ( ( i+1 >= str.length() ) || ( str.charAt(i+1) != '&' ) )
            {
                out.write( c );		// Unmodified character.
            }
            else	// (c != '&') && (next character == '&'):  c may be a
            {		//  modified character.
                if ( (mapping = findEntityMapping(str, i+1)) == null )
                {
                    out.write( c );	// And in following iterations,
                }			//  remainder of entity string.
                else	// mapping!=null, whence there is a ';' somewhere in str.
                {
                    if ( isDiacritic(mapping) )	// Reverse order between
                    {				//  XML & ANSEL.
                        out.write((String) mapping);
                        out.write( c );
                    }
                    else
                    {
                        out.write( c );
                        out.write((String) mapping);
                    }
                    i = str.indexOf(';', i+1);
                }
            }
        }
    }

    /**
     *  Take a stream in the entity-based encoding and create from it a string
     *      in the character-based encoding, stopping at the next &lt; character
     *      (and therefore presumably the next tag) or at end-of-stream.  The
     *      terminating &lt; is consumed but not returned.
     *
     *  Unknown, unterminated and over-long (more than MAX_ENTITY_LENGTH
     *      characters) entities are copied through unchanged.
     *
     *  @throws	EOFException	only when EOF encountered in the middle of an entity.
     */
    public String getStringFromEntityReader(BufferedReader in) throws IOException
    {
        StringBuffer inBuf = new StringBuffer();
        char [] entityChars = new char [MAX_ENTITY_LENGTH];
        Object mapping;
        int i;
        int c;		// Incoming character, read as int then cast.
        while ( true )
        {
            c = in.read();
            if ( c == -1 )	// EOF after character or completed entity:  OK.
                return( inBuf.toString() );

            switch ( (char) c )
            {
             case '&':
                entityChars[0] = '&';	// Prepare for "unknown entity" case.
                i = 1;
                while (c != ';')
                {
                    c = in.read();
                    if ( c == -1 )	// EOF
                        throw new EOFException("EOF within entity");
                    if ( c == '&' )	// The last '&' was unterminated. Append
                    {			//  what we've got and start over.
                        inBuf.append(entityChars, 0, i);
                        i = 1;
                    }
                    else if ( c == '<' )	// The last '&' was unterminated
                    {				//  and the text is done. Append
                        inBuf.append(entityChars, 0, i);	// what we've
                        return( inBuf.toString() );		//  got & leave.
                    }
                    else if ( i >= MAX_ENTITY_LENGTH )	// Something screwy.
                    {					//  Append what we've
                        inBuf.append(entityChars, 0, i);	//  got and get
                        inBuf.append((char) c);		//  back to business.
                        i = 0;
                        break;
                    }
                    else
                        entityChars[i++] = (char) c;
                }
                if ( i == 0 )	// Abnormal end while collecting entity.  Go
                   break;	//  back to running main loop.

                // The ampersand, entity string and semicolon are all in
                //  entityChars[0..i).
                mapping = findEntityMapping(new String(entityChars, 0, i), 0);
                if ( mapping == null )
                {
                    inBuf.append(entityChars, 0, i);
                }
                else if ( isDiacritic(mapping) && (inBuf.length() > 0) )
                {
                    // Reverse order between XML & ANSEL:  the diacritic goes
                    //  in front of the character it modifies.
                    int last = inBuf.length() - 1;
                    char mod = inBuf.charAt(last);
                    inBuf.setLength(last);
                    inBuf.append((String) mapping);
                    inBuf.append( mod );
                }
                else	// Plain entity, or a diacritic with nothing before
                {	//  it in the buffer to modify.
                    inBuf.append((String) mapping);
                }
                break;

             case '<':
                return( inBuf.toString() );

             default:
                inBuf.append((char) c);
                break;
            }
        }
    }
}

