package edu.vt.marian.Document;

import java.io.*;
import java.net.*;
import java.util.*;

import edu.vt.marian.common.*;


/**
	SgmlDocument
	<P>class description: this class represents an NLM SGML document in the system.
		The raw SGML text is kept as a single string; it is parsed lazily into
		(field name, field data) pairs the first time a field is requested.
	<P>designer(s): Jianxin Zhao (jxzhao@csgrad.cs.vt.edu)
	<P>implementator(s): Jianxin Zhao (jxzhao@csgrad.cs.vt.edu), Robert France
	<P>finished time: 
	<P>known bugs: 
	<P>JDK version: 1.1.5
	<P>side effects:

*/

public class SgmlDocument implements Document
{
	/** this string contains all the information of this document
		the format of the string is in SGML; null means the object is invalid
	*/
	private String sgmlString = null;

	/** this vector contains formatted data from the sgml string, stored as
		alternating entries: field name at even positions, field data at the
		following odd position
	*/
	private Vector fields = null;

	/** this flag tells whether or not an extraction of the current sgml string
		into the vector fields has been attempted; it is reset whenever the
		sgml string is replaced
	*/
	private boolean extracted = false;

	/** this string array maintains the mapping between tags and fields along
		with the separator used to concatenate strings when there are multiple 
		tags with the same name
	*/
	private final static String[][] tagToField = {
		{"ab", "description", ""},  // format -- tag name, field name, separator	
		{"ac", "acronym", ""},	
		{"ad", "address", ""},	
		{"eml", "email", ""},	
		{"fx", "cross reference", ""},	
		{"gn", "general notes", ""},	
		{"ho", "holdings", ""},	
		{"ic", "contact person", ""},	
		{"kw", "keyword", ""},	
		{"lun", "limitations on use", ""},	
		{"mh", "mesh heading", ""},	
		{"na", "name", ""},	
		{"nt", "type of orgnization", ""},	
		{"pb", "publications", ""},	
		{"rg", "region", ""},	
		{"sa", "sponsoring agency", ""},	
		{"site", "other officies", ""},	
		{"tel", "telephone number", ""},	
	};


	/** those are the return values of methods of this class
	*/
	public final static int OK = 0;
	public final static int NULL_STREAM = 3;
	public final static int NULL_DOCUMENT_STRING = 6;
	public final static int NULL_SGML_STRING = 8;
	public final static int EXTRACT_ERROR = 9;
	public final static int INVALID_TAG_NAME = 10;
	public final static int NULL_FIELD_NAME = 11;

	/** just used for debugging
	*/
	Debug debug;


	/**
		create an SgmlDocument object from the specified stream.
		<P>The expected layout is the one written by toStream(): one line 
		holding the number of lines, followed by that many lines of SGML text.
		If the count cannot be read, is not positive, or the stream ends or
		fails before all lines are read, the object is left invalid 
		(see isValid()).
		@param	br	the stream from which to read out this document 
		@param	debug	used for debugging
	*/      
	public SgmlDocument(BufferedReader br, Debug debug)
	{
		this.debug = debug;

		if (br == null)
		{
			debug.dumpTrace("SgmlDocument.[constructor 1]: br is null");
			return;
		}

		// br is not null, read out the line count first
		int num_lines = 0;
		try
		{
			// readLine() returns null at end of stream; parseInt(null) throws
			// NumberFormatException, so both cases end up in the handlers below.
			num_lines = Integer.parseInt(br.readLine());
		}
		catch (IOException e0)
		{
			debug.dumpTrace("SgmlDocument.[constructor 1]: error reading number of lines");
		}
		catch (NumberFormatException e0)
		{
			debug.dumpTrace("SgmlDocument.[constructor 1]: error reading number of lines");
		}

		if (num_lines <= 0)	// Even the null string gets an empty line.
		{
			return;
		}

		String separator = System.getProperty("line.separator");
		StringBuffer sb = new StringBuffer();
		try
		{
			for (int i = 0; i < num_lines; i++)
			{
				String line = br.readLine();
				if (line == null)
				{
					// The stream ended early.  Without this check the literal
					// text "null" would be appended to the document.
					debug.dumpTrace("SgmlDocument.[constructor 1]: error reading sgml string");
					return;
				}

				if (i > 0)
				{
					// not the first line, so add a line separator between them
					sb.append(separator);
				}
				sb.append(line);
			}
			sgmlString = sb.toString();
		}
		catch (IOException e1)
		{
			debug.dumpTrace("SgmlDocument.[constructor 1]: error reading sgml string");
			sgmlString = null;
		}
	}


	/**
		create an SgmlDocument object from a document string.
		@param	documentString	a string encoding this document in SGML; if
				null the resulting object is invalid
		@param	debug	used for debugging
	*/      
	public SgmlDocument(String documentString, Debug debug)
	{
		this.debug = debug;

		sgmlString = documentString;

	}

	/**
		tell whether the object is valid (not whether it has been extracted yet).
		@return	true if this object holds an sgml string, false otherwise
	*/      
	public boolean isValid()
	{
		return( sgmlString != null );
	}


	/**
		tell whether this object and the 
			parameter object represent the same document.
		<P><B>NOTE:</B>  At this point we are using String compare on the
			raw strings to determine equality.  This obviously leaves
			something to be desired.
		<P><B>NOTE:</B>  This method takes an SgmlDocument, so it overloads
			rather than overrides Object.equals(Object).
		@param	d	the document used to compare with this object
		@return	true / false; two invalid documents (no sgml string) compare equal
	*/      
	public boolean equals(SgmlDocument d)
	{
		if (d == null)
		{
			debug.dumpTrace("SgmlDocument.equals(): d is null");
			return( false );
		}

		if (sgmlString == null)
		{
			// An invalid document has no text to compare; avoid dereferencing null.
			return( d.sgmlString == null );
		}

		// Not a great implementation.  What if d differs only by white space?
		return( sgmlString.equals(d.sgmlString) );
	}


	/**
		print the contents of this object to the specified stream.
		<P>The line count is written first, then the sgml string itself, which
		is the layout the stream constructor reads back.
		@param	pw	the stream to which to write this object
		@return	OK -- this object has been written to the stream correctly
		<P>	NULL_STREAM -- the parameter stream is null
	*/      
	public int toStream(PrintWriter pw)
	{
		if (pw == null)
		{
			debug.dumpTrace("SgmlDocument.toStream(): parameter stream is null");
			return NULL_STREAM;
		}

		// NOTE(review): for an invalid document sgmlString is null here; what
		// LinedString.count_lines does with null is not visible from this
		// file -- confirm before relying on toStream() for invalid objects.
		pw.println(LinedString.count_lines(sgmlString));
		pw.println(sgmlString);

		return OK;
	}


	/**
		return the sgml string of the document this object represents.
		@return	the raw form of this document as a string (null if invalid)
	*/      
	public String getDocumentString()
	{
		return sgmlString;
	}


	/**
		set the sgml string of the document this object represents.
		Any previously extracted fields are discarded on the next field access.
		@param	documentString	this will become the new raw string for 
				this document object
		@return	OK -- the new raw sgml string has been set correctly
		<P>	NULL_DOCUMENT_STRING -- the parameter is null
	*/      
	public int setDocumentString(String documentString)
	{
		if (documentString == null)
		{
			debug.dumpTrace("SgmlDocument.setDocumentString(): parameter documentString is null");
			return NULL_DOCUMENT_STRING;
		}

		// document string is not null
		sgmlString = documentString;
		extracted = false;
		return OK;
	}


	/**
		make sure the sgml string has been run through extract() once.
		<P>The result code of extract() is not propagated: on failure the 
		fields vector simply holds whatever was parsed before the error 
		(possibly nothing).  Because the outcome depends only on the sgml 
		string, the attempt is not repeated until the string changes.
	*/
	private void ensureExtracted()
	{
		if (! extracted)
		{
			extract();
			extracted = true;
		}
	}


	/**
		extract sgml string into different fields.
		<P>Only the text between the first &lt;DOC&gt; and the first 
		&lt;/DOC&gt; is parsed.  Each top level element is looked up in the 
		tag table; unknown tags are skipped silently.
		@return	OK -- the characteristics have been extracted successfully
		<P>	NULL_SGML_STRING -- there is no sgml string to parse
		<P>	EXTRACT_ERROR -- the sgml string is malformed; fields parsed 
			before the error are kept
	*/      
	private int extract()
	{
		// Allocate the vector before any early return so that the accessors
		// never find a null vector, even for an invalid document.
		fields = new Vector();

		if (sgmlString == null)
		{
			debug.dumpTrace("SgmlDocument.extract(): sgml string is null");
			return NULL_SGML_STRING;
		}

		// do some error checking here
		int start_index = sgmlString.indexOf("<DOC>");
		if (start_index == -1)
		{
			debug.dumpTrace("SgmlDocument.extract(): sgml string doesn't contain doc tag");
			return EXTRACT_ERROR;
		}
		start_index += 5;	// skip over "<DOC>"
		int end_index = sgmlString.indexOf("</DOC>");
		if (end_index == -1)
		{
			debug.dumpTrace("SgmlDocument.extract(): sgml string doesn't contain /doc tag");
			return EXTRACT_ERROR;
		}
		if (start_index >= end_index)
		{
			debug.dumpTrace("SgmlDocument.extract(): tags doc and /doc misplaced");
			return EXTRACT_ERROR;
		}

		// parsing all the tags
		while (start_index < end_index)
		{
			// get the name of the tag
			int tag_name_begin_index = sgmlString.indexOf("<", start_index);
			if ((tag_name_begin_index == -1) || (tag_name_begin_index == end_index))
			{
				// the next "<" is the one of "</DOC>": nothing left to parse
				break;
			}

			int tag_name_end_index = sgmlString.indexOf(">", tag_name_begin_index);
			if ((tag_name_end_index == -1) || (tag_name_end_index > end_index))
			{
				debug.dumpTrace("SgmlDocument.extract(): tag name not ended");
				return EXTRACT_ERROR;
			}

			if (tag_name_begin_index == (tag_name_end_index - 1))
			{
				debug.dumpTrace("SgmlDocument.extract(): empty tag identified");
				return EXTRACT_ERROR;
			}
			String tag_name = sgmlString.substring(tag_name_begin_index + 1, tag_name_end_index);

			// get the data of the tag: everything up to the first matching end tag
			int tag_data_begin_index = tag_name_end_index + 1;
			int tag_data_end_index = sgmlString.indexOf("</" + tag_name + ">",
				tag_data_begin_index );
			if ((tag_data_end_index == -1) || (tag_data_end_index >= end_index))
			{
				debug.dumpTrace("SgmlDocument.extract(): parse field error");
				return EXTRACT_ERROR;
			}
			String tag_data = sgmlString.substring(tag_data_begin_index, tag_data_end_index);

			// remove tags from tag data and then add this tag to fields vector;
			// add_tag() reports unknown tags through its result, which is 
			// ignored here on purpose so that they are simply skipped
			add_tag(tag_name, filt(tag_data));

			// prepare for the next tag: skip "</" + tag_name + ">"
			start_index = tag_data_end_index + 3 + tag_name.length();
		}

		// all the tags has been parsed
		return OK;
	}


	/**
		remove all the tags inside the parameter tag_data.
		<P>A tag is a "&lt;" together with everything up to the next "&gt;" 
		that follows it.  A "&gt;" with no "&lt;" before it, or a "&lt;" that 
		is never closed, is ordinary text and is kept.
		@param	tag_data	the text to clean, must not be null
		@return	tag_data with every tag removed
	*/      
	private String filt(String tag_data)
	{
		StringBuffer sb = new StringBuffer(tag_data.length());
		int current_position = 0;
		while (true)
		{
			int tag_begin = tag_data.indexOf('<', current_position);
			if (tag_begin == -1)
			{
				break;
			}

			// Look for the closing bracket starting at the opening one, so a
			// stray ">" earlier in the text can never be taken for its end.
			int tag_end = tag_data.indexOf('>', tag_begin + 1);
			if (tag_end == -1)
			{
				break;
			}

			// keep the text in front of the tag, drop the tag itself
			sb.append(tag_data.substring(current_position, tag_begin));
			current_position = tag_end + 1;
		}
		
		// boundary condition: whatever follows the last tag
		sb.append(tag_data.substring(current_position));
		return sb.toString();
	}

	
	/**
		add the tag to the corresponding position of the fields vector.
		If the field already exists the new data is appended to it, with the 
		separator configured for the tag in between.
		@param	tag_name	name of the sgml tag as found in the document
		@param	tag_data	text of the tag, already stripped of inner tags
		@return	OK -- the data has been stored
		<P>	INVALID_TAG_NAME -- the tag is not in the tag table, nothing stored
	*/      
	private int add_tag(String tag_name, String tag_data)
	{
		String field_name = null;
		String separator = null;
		for (int i = 0; i < tagToField.length; i++)
		{
			if (tagToField[i][0].equals(tag_name))
			{
				// we found it, assume there is no duplicates
				field_name = tagToField[i][1];
				separator = tagToField[i][2];
				break;
			}
		}

		if (field_name == null)
		{
//			debug.dumpTrace("SgmlDocument.add_tag(): invalid tag name");
			return INVALID_TAG_NAME;
		}

		// put it into corresponding position of the fields vector
		for (int j = 0; j < fields.size(); j += 2)
		{
			if (field_name.equals((String) fields.elementAt(j)))
			{
				// we find the field, append to it
				String new_field_data = ((String) fields.elementAt(j + 1))
					+ separator + tag_data;
				fields.setElementAt(new_field_data, j + 1);
				return OK;
			}
		}

		// this is the first time this field appears, append it to the 
		// end of the vector
		fields.addElement(field_name);
		fields.addElement(tag_data);
		return OK;
	}


	/**
		return the number of fields in this document
		@return	the number of distinct fields found; 0 if the document is 
			invalid or nothing could be parsed
	*/
	public int getNumberFields()
	{
		ensureExtracted();

		return fields.size() / 2;
	}


	/**
		return the name of the specified field.
		@param	index	position of the field, from 0 to getNumberFields() - 1
		@return	the field name, or null if index is out of range
	*/
	public String getFieldNameByIndex(int index)
	{
		ensureExtracted();

		if ((index < 0) || (index >= (fields.size() / 2)))
		{
			debug.dumpTrace("SgmlDocument.getFieldNameByIndex(): index is not valid");
			return null;
		}

		// valid index, return the name of the field
		return (String) fields.elementAt(index * 2);
	}


	/**
		return the data of the specified field.
		@param	index	position of the field, from 0 to getNumberFields() - 1
		@return	the field data, or null if index is out of range
	*/
	public String getFieldDataByIndex(int index)
	{
		ensureExtracted();

		if ((index < 0) || (index >= (fields.size() / 2)))
		{
			debug.dumpTrace("SgmlDocument.getFieldDataByIndex(): index is not valid");
			return null;
		}

		// valid index, return the data of the field
		return (String) fields.elementAt(index * 2 + 1);
	}


	/**
		return the data of this document corresponding to the specified field
		@param	field_name	name of the field, e.g. "name" or "acronym"
		@return	the field data in the form of a String, or
		<P>	null -- field_name is null, or the document has no such field
	*/      
	public String getFieldData(String field_name)
	{
		if (field_name == null)
		{
			debug.dumpTrace("SgmlDocument.getFieldData(): parameter field_name is null");
			return null;
		}

		// try to find the data of the field
		ensureExtracted();

		for (int i = 0; i < fields.size(); i += 2)
		{
			if (field_name.equals((String) fields.elementAt(i)))
			{
				// we found the field, return its data
				return (String) fields.elementAt(i + 1);
			}
		}

		// this is no such field in this document
		return null;
	}


	/**
		tell the separator between different text strings in the specified field.
		The answer comes from the static tag table, not from this document's 
		contents.
		@param	fieldName	name of field to search
		@return String, or null if no such field exists.
	*/      
	public String getFieldSeparator(String fieldName)
	{
		if (fieldName == null)
		{
			debug.dumpTrace("SgmlDocument.getFieldSeparator(): parameter fieldName is null");
			return null;
		}

		// try to find the separator of this field
		for (int i = 0; i < tagToField.length; i++)
		{
			if (fieldName.equals(tagToField[i][1]))
			{
				// we found a match, return its separator
				return tagToField[i][2];
			}
		}

		// there is no such field in the tag table
		debug.dumpTrace("SgmlDocument.getFieldSeparator(): invalid fieldName");
		return null;
	}

	
	/**
		An attempt to get around declaring public clone() methods.
		@return	a new SgmlDocument built from the same sgml string and debug object
	*/
	public DigInfObj copy()
	{
		return( (DigInfObj) new SgmlDocument(sgmlString, debug) );
	}


	/**
		build the one line title used by the short and full presentations:
		the "name" field, followed by the "acronym" field in parentheses when 
		there is one.  Newline characters are replaced by blanks.
		@return	the title, or null if the document has no "name" field
	*/
	private String titleLine()
	{
		String name = getFieldData("name");
		if (name == null)
		{
			return null;
		}

		String title = name.replace('\n', ' ');
		String acronym = getFieldData("acronym");
		if (acronym != null)
		{
			// add the acronym too
			title += " (" + acronym.replace('\n', ' ') + ")";
		}
		return title;
	}


	/**
		return the short description of this document in one line.
		@param	markupType	how to mark up the string returned (e.g., HTML or ASCII).
				Currently ignored.
		@return	the short description String, "" if the document has no name.
	*/
	public String presentShort(int markupType)
	{
		String title = titleLine();
		if (title == null)
		{
			return "";
		}
		return title;
	}


	/**
		return a Vector of metadata attributes for this document.
		@param	markupType	how to mark up the string returned (e.g., HTML or ASCII).
		@return	a Vector of triples [attrName, attrType, attrValue]; 
			currently always null (not yet implemented).
	*/
	public Vector presentAttributes(int markupType)
	{
		debug.dumpTrace("SgmlDocument.presentAttributes(): not yet implemented");
		return null;
	}
	
	/**
		not yet implemented.
		@return	currently always null
	*/
	public Vector attributes()
	{
		debug.dumpTrace("SgmlDocument.attributes(): not yet implemented");
		return null;
	}

	/**
		not yet implemented.
		@param	attrID	identifier of the attribute requested
		@param	markupType	how to mark up the value returned
		@return	currently always null
	*/
	public Object presentAttribute(int attrID, int markupType)
	{
		debug.dumpTrace("SgmlDocument.presentAttribute(): not yet implemented");
		return null;
	}


	/**
		return the full description of this document.
		The result is HTML: a title followed by one section per non-empty field.
		@param	markupType	how to mark up the string returned (e.g., HTML or ASCII).
				Currently ignored, the output is always HTML.
		@return	a (potentially very long) String.
	*/
	public String presentFull(int markupType)
	{
		StringBuffer sb = new StringBuffer(2048);	// A guess on the size.

		// format the title
		String title = titleLine();
		if (title != null)
		{
			sb.append("<FONT SIZE=+1><STRONG>");
			sb.append(title);
			sb.append("</STRONG></FONT>");
		}

		// format other sections 
		format_section_1("description", "Description", sb);
		format_section("contact person", "Contact Person(s)", sb);
		format_section("address", "Address", sb);
		format_section("telephone number", "Telephone Number(s)", sb);
		format_section_special("email", sb);
		format_section_1("general notes", "General Notes", sb);
		format_section_1("holdings", "Holdings", sb);
		format_section_1("limitations on use", "Limitations On Use", sb);
		format_section_1("publications", "Publications", sb);
		format_section("other officies", "Other Officies", sb);
		format_section("type of orgnization", "Type of Orgnization", sb);
		format_section("sponsoring agency", "Sponsoring Agency(s)", sb);
		format_section("region", "NULM Region Number", sb);
		format_section("cross reference", "Cross Reference(s)", sb);
		format_section("mesh heading", "MeSH Heading(s)", sb);
		format_section("keyword", "Keyword(s)", sb);
		
		return sb.toString();
	}

	/**
		write the short description of this document to the given writer.
		@param	markupType	how to mark up the text written
		@param	out	the writer to receive the text
		@return	ReturnCodes.OK -- the text has been written
		<P>	ReturnCodes.NO_CAN_DO -- there was no text to write
		@exception	IOException	if writing to out fails
	*/
	public int presentShort(int markupType, BufferedWriter out) throws IOException
	{
		String str = presentShort(markupType);
		if ( str == null )
		{
			return( ReturnCodes.NO_CAN_DO );
		}
		out.write(str);
		return( ReturnCodes.OK );
	}

	/**
		return the long description of this document; same as presentFull().
		@param	markupType	how to mark up the string returned
		@return	the full description String.
	*/
	public String presentLong(int markupType)
	{
		return( presentFull(markupType) );
	}

	/**
		write the long description of this document to the given writer.
		@param	markupType	how to mark up the text written
		@param	out	the writer to receive the text
		@return	ReturnCodes.OK -- the text has been written
		<P>	ReturnCodes.NO_CAN_DO -- there was no text to write
		@exception	IOException	if writing to out fails
	*/
	public int presentLong(int markupType, BufferedWriter out) throws IOException
	{
		String str = presentFull(markupType);
		if ( str == null )
		{
			return( ReturnCodes.NO_CAN_DO );
		}
		out.write(str);
		// The text was written, so report success like presentShort() does.
		return( ReturnCodes.OK );
	}

	/**
		write the full description of this document to the given writer.
		@param	markupType	how to mark up the text written
		@param	out	the writer to receive the text
		@return	ReturnCodes.OK -- the text has been written
		<P>	ReturnCodes.NO_CAN_DO -- there was no text to write
		@exception	IOException	if writing to out fails
	*/
	public int presentFull(int markupType, BufferedWriter out) throws IOException
	{
		String str = presentFull(markupType);
		if ( str == null )
		{
			return( ReturnCodes.NO_CAN_DO );
		}
		out.write(str);
		// The text was written, so report success like presentShort() does.
		return( ReturnCodes.OK );
	}

	/** this method will format a section of a sgml document to html string
		and append the string to the end of the parameter StringBuffer.
		Every platform line separator inside the field is followed up with an 
		html BR tag.  Nothing is appended when the field is missing or empty.
		@param	section_name	name of the field to format
		@param	section_header	heading to print above the field data
		@param	sb	buffer to append the html to
	*/
	private void format_section(String section_name, String section_header, StringBuffer sb)
	{
		String s = getFieldData(section_name);
		if ((s == null) || s.equals(""))
		{
			return;
		}

		String separator = System.getProperty("line.separator");

		// format the header
		sb.append("<P><FONT SIZE=+1><STRONG>" + section_header + "</STRONG></FONT><P>");

		// format the body
		int begin_index = 0;
		int end_index = s.indexOf(separator, begin_index);
		while (end_index != -1)
		{
			// we find a line separator, change it to html <BR> tag.
			sb.append(s.substring(begin_index, end_index));
			sb.append("<BR>");
			sb.append(separator);

			// Skip the whole separator: it is two characters on some platforms.
			begin_index = end_index + separator.length();
			end_index = s.indexOf(separator, begin_index);
		}

		// process the boundary condition: text after the last separator
		if (begin_index != s.length())
		{
			sb.append(s.substring(begin_index));
		}
	}

	/** this method will format a section of a sgml document to html string
		and append the string to the end of the parameter StringBuffer, it will 
		not change line separators to html BR tags.
		Nothing is appended when the field is missing or empty.
		@param	section_name	name of the field to format
		@param	section_header	heading to print above the field data
		@param	sb	buffer to append the html to
	*/
	private void format_section_1(String section_name, String section_header,
					StringBuffer sb)
	{
		String s = getFieldData(section_name);
		if ((s == null) || s.equals(""))
		{
			return;
		}

		// format the header, immediately followed by the unchanged body
		sb.append("<P><FONT SIZE=+1><STRONG>" + section_header + "</STRONG></FONT><P>");
		sb.append(s);
	}

	/** this method deals with special fields, currently only the "email" 
		field is supported.  Its lines are split into two html sections: 
		lines containing an "@" are listed as email addresses, all other 
		lines as URLs.
		@param	section_name	name of the field to format, must be "email"
		@param	sb	buffer to append the html to
	*/
	private void format_section_special(String section_name, StringBuffer sb)
	{
		if (! section_name.equals("email"))
		{
			debug.dumpTrace("SgmlDocument.format_section_special(): invalid section name");
			return;
		}

		String s = getFieldData(section_name);
		if ((s == null) || s.equals(""))
		{
			return;
		}

		String separator = System.getProperty("line.separator");
		StringBuffer eml_sb = new StringBuffer();
		StringBuffer url_sb = new StringBuffer();

		// Each character of the separator acts as a delimiter, so empty 
		// lines never show up as tokens.
		StringTokenizer st = new StringTokenizer(s, separator);
		while (st.hasMoreTokens())
		{
			String line = st.nextToken();

			// a line with an "@" is taken for an email address, anything 
			// else for an URL
			StringBuffer target = (line.indexOf('@') != -1) ? eml_sb : url_sb;
			target.append(line);
			target.append("<BR>");
			target.append(separator);
		}

		if (eml_sb.length() != 0)
		{
			// there is at least one email address
			sb.append("<P><FONT SIZE=+1><STRONG>Email(s)</STRONG></FONT><p>");
			sb.append(eml_sb.toString());
		}
		if (url_sb.length() != 0)
		{
			// there is at least one URL address
			sb.append("<P><FONT SIZE=+1><STRONG>URL(s)</STRONG></FONT><p>");
			sb.append(url_sb.toString());
		}
	}
}
