package edu.vt.marian.search;

import java.io.*;
import java.net.*;
import java.util.*;
import edu.vt.marian.common.*;

/**
 * DienstClassManager extends the Marian system so that searches are also
 * carried out on a Dienst server, and so that documents can be retrieved from
 * the Dienst repository. These operations are transparent: the end user does
 * not have to format queries for Dienst and does not even know that a Dienst
 * search is being made.
 *
 * <p>State kept by an instance: the term lists of the query currently being
 * matched, the result set of the most recent {@link #match}, and a table
 * mapping every Marian FullID handed out so far to its Dienst handle.
 * Nothing here is synchronized.
 *
 * @author Nilesh Phadke
 *
 * @see edu.vt.marian.common.FullID
 * @see edu.vt.marian.common.WtdObj
 * @see edu.vt.marian.search.VectorWtdObjSet
 */
public class DienstClassManager implements NodeClassManager {

    /** Base URL of the Dienst boolean-search service; the query string is appended. */
    private static final String SEARCH_URL =
        "http://128.84.248.7:80/Dienst/Index/2.0/SearchBoolean?";

    /** Base URL of the Dienst document-body service; handle and format are appended. */
    private static final String BODY_URL =
        "http://128.84.248.7:80/Dienst/Repository/2.0/Body/";

    /** Initial capacity of a term array; the arrays grow on demand. */
    private static final int INITIAL_TERMS = 8;

    protected Debug debug; // The inevitable.
    protected String DocFormat;
    protected int ClassID;

    /** Maps a Marian FullID to the Dienst handle (a String) it was created for. */
    protected Hashtable MarianDienstIDs = new Hashtable();

    /** The FullID created most recently (kept as a field for subclasses). */
    protected FullID F;
    protected WtdObj weightedobj;

    /** Result set of the most recent match(). */
    protected VectorWtdObjSet VObjSet;

    /** Next instance ID to hand out; never reset, so IDs stay unique. */
    protected int InstanceID;

    protected String[] authors;
    protected String[] titles;
    protected String[] keywords;
    protected String[] abstracts;
    protected int authorsArrayIndex = 0, titlesArrayIndex = 0,
                  keywordsArrayIndex = 0, abstractsArrayIndex = 0;

    /**
     * Constructor for DienstClassManager.
     *
     * @param Clid the Marian class ID under which Dienst documents are exposed
     * @param d    the debug object passed on to the objects created here
     */
    public DienstClassManager(int Clid, Debug d) {
        debug = d;
        ClassID = Clid;
        // The original dereferenced F without ever assigning it, so this
        // constructor always threw NullPointerException.
        F = createID();
        DocFormat = "/text/html";
    }

    /**
     * Creates a FullID carrying this manager's class ID. Prints "ERROR" if the
     * class ID is rejected, exactly as the original constructor did.
     */
    private FullID createID() {
        // NOTE(review): FullID's constructors are not visible from this file;
        // a no-argument constructor is assumed. Adjust here if it differs.
        FullID id = new FullID();
        int result = id.setClassID(ClassID);
        if (result != ReturnCodes.OK) {
            System.out.println("ERROR");
        }
        return id;
    }

    /**
     * Escapes a string for use inside a Dienst URL.
     *
     * <p>The syntax rules for URLs restrict a few characters to special roles
     * and require that, when used in any other way, they be written as an
     * escape sequence: a percent sign followed by the character code in
     * hexadecimal. The reserved characters are:
     * <pre>
     *   /  separates components in the URL
     *   ?  separates optional arguments from the rest of the URL
     *   #  indicates reference to a named anchor within a document
     *   =  separates name from value in an argument list
     *   &amp;  separates multiple arguments after a ?
     * </pre>
     * The slash used in handles must therefore be encoded (as %2f). The space
     * character may not appear anywhere; it is written as "+". Because "+"
     * and "%" thereby acquire a special meaning, literal occurrences of them
     * are escaped as well.
     *
     * @param str the raw text; must not be null
     * @return the text with reserved characters escaped and spaces turned into '+'
     */
    protected String polishstring(String str) {
        StringBuffer out = new StringBuffer(str.length() + 8);
        for (int k = 0; k < str.length(); k++) {
            char c = str.charAt(k);
            switch (c) {
                case ' ': out.append('+');   break;
                case '/': out.append("%2f"); break;
                case '%': out.append("%25"); break;
                case '?': out.append("%3f"); break;
                case '#': out.append("%23"); break;
                case '=': out.append("%3d"); break;
                case '&': out.append("%26"); break;
                case '+': out.append("%2b"); break;
                default:  out.append(c);     break;
            }
        }
        return out.toString();
    }

    /**
     * Matches a description against the Dienst server. It has three parts:
     * <ol>
     * <li>extract the search terms from the link descriptions of
     *     {@code description} and sort them into author, title, abstract and
     *     keyword terms;</li>
     * <li>carry out the search on the Dienst server with those terms;</li>
     * <li>for every handle in the reply create a FullID, add it to the result
     *     set and record the FullID-to-handle mapping.</li>
     * </ol>
     * Links of any other class, and links whose key is not a String, are
     * ignored. If no usable term is found the server is not contacted.
     *
     * @param description the query
     * @return the weighted result set (empty if nothing was searched), or
     *         null if an exception occurred while contacting the server
     */
    public WtdObjSet match(InfoDesc description) {
        // Start every query from a clean slate; otherwise terms and hits of
        // earlier queries would be mixed into this one.
        authorsArrayIndex = 0;
        titlesArrayIndex = 0;
        keywordsArrayIndex = 0;
        abstractsArrayIndex = 0;
        // NOTE(review): VObjSet was never assigned in the original; the
        // constructor signature below is assumed, confirm against
        // VectorWtdObjSet.
        VObjSet = new VectorWtdObjSet(debug);

        int thingstosearch = 0;
        Enumeration linkDescsEnum = description.enumLinkDescs();
        while (linkDescsEnum.hasMoreElements()) {
            LinkDesc currentLink = (LinkDesc) linkDescsEnum.nextElement();
            String term = termOf(currentLink);
            if (term == null) {
                continue;
            }
            switch (currentLink.getClassID()) {
                case ClassIDs.CLASS_HAS_AUTHOR:
                case ClassIDs.CLASS_HAS_CONF_AUTHOR:
                case ClassIDs.CLASS_HAS_CORP_AUTHOR:
                    authors = append(authors, authorsArrayIndex++, term);
                    thingstosearch++;
                    break;
                case ClassIDs.CLASS_HAS_KEYWORD:
                case ClassIDs.CLASS_HAS_SUBJECT:
                    // a subject is treated as a keyword search in Dienst
                    keywords = append(keywords, keywordsArrayIndex++, term);
                    thingstosearch++;
                    break;
                case ClassIDs.CLASS_HAS_ABSTRACT:
                    abstracts = append(abstracts, abstractsArrayIndex++, term);
                    thingstosearch++;
                    break;
                case ClassIDs.CLASS_HAS_TITLE:
                    titles = append(titles, titlesArrayIndex++, term);
                    thingstosearch++;
                    break;
                default:
                    break;
            }
        }

        if (thingstosearch != 0) {
            try {
                search();
            } catch (Exception e) {
                System.out.println("DienstClassManager.match(): exception " + e.toString() + " while contacting remote server.");
                return null;
            }
        }
        return VObjSet;
    }

    /** Returns the String key of a link, or null if it has none. */
    private static String termOf(LinkDesc link) {
        InfoDesc keyDesc = link.getKeyDesc();
        if (keyDesc == null) {
            return null;
        }
        Object node = keyDesc.getNodeDesc();
        return (node instanceof String) ? (String) node : null;
    }

    /** Stores term at index used, allocating or growing the array as needed. */
    private static String[] append(String[] terms, int used, String term) {
        if (terms == null) {
            terms = new String[Math.max(INITIAL_TERMS, used + 1)];
        } else if (used >= terms.length) {
            String[] bigger = new String[Math.max(terms.length * 2, used + 1)];
            System.arraycopy(terms, 0, bigger, 0, terms.length);
            terms = bigger;
        }
        terms[used] = term;
        return terms;
    }

    /**
     * Converts an int to an Integer. Hashtable keys and values must be
     * objects, so an int cannot be stored in a Hashtable directly.
     *
     * @param val the value to wrap
     * @return an Integer holding val
     */
    protected Integer intTOInteger(int val) {
        return new Integer(val);
    }

    /**
     * Runs the Dienst searches for the terms collected by match().
     *
     * <p>Author, title and abstract terms are combined into one boolean
     * query, the fields joined by "&amp;". Keyword terms are sent as a second,
     * separate query. Every handle found is recorded through recordHandles().
     *
     * @throws Exception if a server cannot be contacted, or if reading the
     *         reply of the combined query fails. A failure while reading the
     *         keyword reply is reported and otherwise ignored, so the hits
     *         gathered up to that point are kept.
     */
    protected void search() throws Exception {
        String authorstring = fieldQuery("author", authors, authorsArrayIndex);
        String titlestring = fieldQuery("title", titles, titlesArrayIndex);
        String abstractstring = fieldQuery("abstract", abstracts, abstractsArrayIndex);
        String keywordstring = fieldQuery("keyword", keywords, keywordsArrayIndex);

        String booleansearchstring = authorstring;
        booleansearchstring = andField(booleansearchstring, titlestring);
        booleansearchstring = andField(booleansearchstring, abstractstring);

        // Author / title / abstract search
        if (booleansearchstring.length() > 0) {
            BufferedReader in = openReader(SEARCH_URL + booleansearchstring);
            try {
                recordHandles(in);
            } finally {
                in.close();
            }
        }

        // Keyword search
        if (keywordstring.length() > 0) {
            BufferedReader in = openReader(SEARCH_URL + keywordstring);
            try {
                recordHandles(in);
            } catch (Exception e) {
                // Deliberately best effort, as in the original.
                System.out.println("DienstClassManager.search(): exception " + e.toString() + " while reading keyword results.");
            } finally {
                in.close();
            }
        }
    }

    /**
     * Builds "field=term1+term2..." from the first count terms, each escaped
     * with polishstring(); returns "" when there are no terms.
     */
    private String fieldQuery(String field, String[] terms, int count) {
        if (count < 1) {
            return "";
        }
        StringBuffer query = new StringBuffer(field);
        query.append('=').append(polishstring(terms[0]));
        for (int i = 1; i < count; i++) {
            // NOTE(review): the joiner between several terms of one field was
            // lost from the original source; '+' (an encoded space) is
            // assumed. Confirm against the Dienst SearchBoolean documentation.
            query.append('+').append(polishstring(terms[i]));
        }
        return query.toString();
    }

    /** Appends a field query to a boolean query, inserting "&" when both are non-empty. */
    private static String andField(String query, String field) {
        if (field.length() == 0) {
            return query;
        }
        return (query.length() == 0) ? field : query + "&" + field;
    }

    /** Opens the given URL and returns a line reader on the reply. */
    private static BufferedReader openReader(String address) throws IOException {
        URL url = new URL(address);
        URLConnection connection = url.openConnection();
        connection.setDoOutput(true);
        return new BufferedReader(new InputStreamReader(connection.getInputStream()));
    }

    /**
     * Reads a search reply. Lines 0, 5, 10, ... of the reply are taken to be
     * Dienst handles; for each one a new FullID is created, added to the
     * result set with top weight and entered in the FullID-to-handle table.
     */
    private void recordHandles(BufferedReader in) throws IOException {
        String inputLine;
        int lineNumber = 0;
        while ((inputLine = in.readLine()) != null) {
            if ((lineNumber % 5) == 0) {
                // A separate FullID per hit: the original reused one mutable
                // object as every Hashtable key and in every WtdObj, so all
                // entries aliased the same ID.
                // NOTE(review): later lookups by an equal-but-distinct FullID
                // rely on FullID defining equals()/hashCode() by value; verify.
                F = createID();
                F.setInstanceID(InstanceID++);
                weightedobj = new WtdObj(F, Weight.topWt, debug);
                VObjSet.add(weightedobj);
                MarianDienstIDs.put(F, inputLine);
                System.out.println(F.toString() + " " + inputLine);
            }
            lineNumber++;
        }
    }

    /**
     * Retrieves a document. The Marian FullID is translated into the Dienst
     * handle recorded for it by an earlier search, and the document body is
     * requested from the Dienst repository in the format DocFormat.
     *
     * <p>According to the original notes Dienst supports these formats:
     * <pre>
     *   text      plain ASCII text, sent as text/plain
     *   ocr       ASCII text produced by OCR, sent as text/plain
     *   scanned   scanned page image, usually TIFF, at no less than 300 spots per inch
     *   inline    a page image suitable for screen display, usually a GIF at
     *             about 72 dots per inch, four bits per pixel
     *   structure a document structure file
     *   html      an HTML document, sent as text/html
     * </pre>
     *
     * @param Fid the Marian ID of the document
     * @return the document as a String, its lines separated by '\n'; null if
     *         the ID is unknown or the document could not be fetched
     */
    public Object idToObject(FullID Fid) {
        if (Fid == null) {
            return null;
        }
        String handle = (String) MarianDienstIDs.get(Fid);
        if (handle == null) {
            // Unknown ID: the original went on to request ".../Body/null/text/html".
            return null;
        }

        StringBuffer DocBody = new StringBuffer();
        try {
            // NOTE(review): the handle is sent unescaped, as in the original,
            // although slashes in handles are supposed to be encoded as %2f.
            // Confirm whether polishstring(handle) is required here.
            BufferedReader in = openReader(BODY_URL + handle + DocFormat);
            try {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    // Keep the line breaks; plain concatenation glued the last
                    // word of each line to the first word of the next.
                    DocBody.append(inputLine).append('\n');
                }
            } finally {
                in.close();
            }
        } catch (Exception e) {
            System.out.println("DienstClassManager.idToObject(): exception " + e.toString() + " while contacting remote server.");
            return null;
        }
        return DocBody.toString();
    }

    /**
     * Bulk retrieval is not implemented.
     *
     * @param Vids the IDs to retrieve (ignored)
     * @return always null
     */
    public Vector idsToObjects(Vector Vids) {
        return null;
    }

    /**
     * Tells whether an ID was handed out by this manager.
     *
     * @param id the ID to test; null is never in the class
     * @return true if a Dienst handle is recorded for id
     */
    public boolean isInClass(FullID id) {
        return id != null && MarianDienstIDs.get(id) != null;
    }

    /**
     * The size of the class is not available.
     *
     * @return always ReturnCodes.NO_CAN_DO
     */
    public long classSize() {
        return ReturnCodes.NO_CAN_DO;
    }
}