uk.ac.man.entitytagger
Class Mention

java.lang.Object
  extended by uk.ac.man.entitytagger.Mention
All Implemented Interfaces:
java.io.Serializable, java.lang.Comparable<Mention>

public class Mention
extends java.lang.Object
implements java.lang.Comparable<Mention>, java.io.Serializable

Class for representing a text match, containing the matched text, document coordinates and normalized IDs.

Author:
Martin
See Also:
Serialized Form

Nested Class Summary
private  class Mention.IDPair
           
 class Mention.SimpleMention
           
 
Field Summary
private  java.lang.String comment
           
static java.lang.String COMMENT_SEPARATOR
           
private  java.lang.String docid
           
private  int end
           
private  java.lang.String[] idLines
           
private  java.lang.String[] ids
           
private  java.lang.Double[] probabilities
           
private static long serialVersionUID
           
private  int start
           
private  java.lang.String text
           
 
Constructor Summary
Mention(java.lang.String[] ids)
           
Mention(java.lang.String[] ids, int start, int end, java.lang.String text)
           
Mention(java.lang.String id, int start, int end, java.lang.String text)
           
 
Method Summary
 void addToPstmtBatch(java.sql.PreparedStatement pstmt)
          Saves the match to a database using PreparedStatements.
 Mention clone()
           
 int compareTo(Mention o)
           
 boolean containsID(java.lang.String id)
           
 void disambiguate(java.lang.String id)
          If containsID(id) is true, will set ids to id only (any other ids will be deleted), otherwise throws an exception
 boolean equals(java.lang.Object o)
           
static Mention findClosestMention(java.util.List<Mention> mentions, int pos)
           
 java.lang.String getComment()
           
 java.lang.String getDocid()
           
 int getEnd()
           
 java.lang.String[] getIds()
           
 java.lang.String getIdsToString()
           
 java.lang.String[] getIdsWithLineNumbers()
           
static java.util.List<Mention> getMentionsInRange(java.util.List<Mention> mentions, int start, int end)
           
 java.lang.String getMostProbableID()
           
 java.lang.String getMostProbableIDWithIdLine()
           
 java.lang.Double[] getProbabilities()
           
 int getStart()
           
 java.lang.String getText()
           
 boolean isAmbigous()
           
static java.util.List<Mention> loadFromFile(java.io.File file)
           
static java.util.List<Mention> loadFromFile(java.io.File file, java.util.Set<java.lang.String> validDocumentIDs, java.lang.String restrictPostfix, java.util.HashMap<java.lang.String,java.lang.String> conversionMap)
          Will load a set of matches from a file, constrained such that only matches from documents in validDocumentIDs are returned.
static java.util.Map<java.lang.String,java.util.List<Mention>> loadFromFileToHash(java.io.File file, java.util.Set<java.lang.String> validDocumentIDs, java.lang.String restrictPostfix, java.util.HashMap<java.lang.String,java.lang.String> conversionMap)
          Loads matches using loadFromFile(file,validDocumentIDs,restrictPostfix,conversionMap) and then splits them up to allow access by document ID.
 boolean overlaps(Mention m2)
           
static boolean overlaps(Mention m1, Mention m2)
           
 boolean overlapsIgnoreDoc(Mention n)
           
static void saveToFile(java.util.ArrayList<Mention> matches, java.io.File file)
          Saves the list of matches to file, sorted by their start and end coordinates.
static void saveToStreamInBCFormat(java.io.BufferedWriter outStream, java.util.List<Mention> mentions, java.lang.String restrictBySpecies, java.util.Map<java.lang.String,java.lang.String> toSpeciesMap)
          Saves a list of mentions to a stream, in BioCreative 2 format, suitable for evaluation.
 void setComment(java.lang.String comment)
           
 void setDocid(java.lang.String docid)
           
 void setEnd(int end)
           
 void setIds(java.lang.String[] ids)
          Will also clear the probabilities.
 void setProbabilities(java.lang.Double[] probabilities)
           
 void setStart(int start)
           
 void setText(java.lang.String text)
           
 Mention.SimpleMention simplify(java.util.Map<java.lang.String,java.lang.String> descriptionMap)
           
static void sort(java.util.List<Mention> mentions)
           
 void sortIDsByProbabilities()
           
 java.lang.String toString()
           
 
Methods inherited from class java.lang.Object
finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait
 

Field Detail

serialVersionUID

private static final long serialVersionUID
See Also:
Constant Field Values

COMMENT_SEPARATOR

public static final java.lang.String COMMENT_SEPARATOR
See Also:
Constant Field Values

start

private int start

end

private int end

text

private java.lang.String text

ids

private java.lang.String[] ids

comment

private java.lang.String comment

docid

private java.lang.String docid

probabilities

private java.lang.Double[] probabilities

idLines

private java.lang.String[] idLines
Constructor Detail

Mention

public Mention(java.lang.String[] ids)

Mention

public Mention(java.lang.String id,
               int start,
               int end,
               java.lang.String text)

Mention

public Mention(java.lang.String[] ids,
               int start,
               int end,
               java.lang.String text)
Method Detail

clone

public Mention clone()
Overrides:
clone in class java.lang.Object

saveToStreamInBCFormat

public static void saveToStreamInBCFormat(java.io.BufferedWriter outStream,
                                          java.util.List<Mention> mentions,
                                          java.lang.String restrictBySpecies,
                                          java.util.Map<java.lang.String,java.lang.String> toSpeciesMap)
Saves a list of mentions to a stream, in BioCreative 2 format, suitable for evaluation.

Parameters:
outStream -
mentions - The mentions, from a _single_ document.
restrictBySpecies - if not null, any mentions not from the specified species will be ignored
toSpeciesMap - map from mention ids to species ids, used if restrictBySpecies is not null

containsID

public boolean containsID(java.lang.String id)
Parameters:
id -
Returns:
whether this mention contains the given id

simplify

public Mention.SimpleMention simplify(java.util.Map<java.lang.String,java.lang.String> descriptionMap)

disambiguate

public void disambiguate(java.lang.String id)
If containsID(id) is true, will set ids to id only (any other ids will be deleted), otherwise throws an exception

Parameters:
id -
Throws:
java.lang.IllegalStateException - if containsID(id) == false

toString

public java.lang.String toString()
Overrides:
toString in class java.lang.Object
Returns:
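a String in the format "id, doc id, start, end, text, comment" (without quotation marks); the fields appear in that order, separated by a delimiter (presumably a tab) that is not visible in this rendering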

getIds

public java.lang.String[] getIds()

getStart

public int getStart()

getEnd

public int getEnd()

isAmbigous

public boolean isAmbigous()
Returns:
getIds().length > 1

equals

public boolean equals(java.lang.Object o)
Overrides:
equals in class java.lang.Object

overlaps

public static boolean overlaps(Mention m1,
                               Mention m2)
Parameters:
m1 -
m2 -
Returns:
whether m1 and m2 overlap

overlaps

public boolean overlaps(Mention m2)
Parameters:
m2 -
Returns:
whether the match overlaps with m2

loadFromFileToHash

public static java.util.Map<java.lang.String,java.util.List<Mention>> loadFromFileToHash(java.io.File file,
                                                                                         java.util.Set<java.lang.String> validDocumentIDs,
                                                                                         java.lang.String restrictPostfix,
                                                                                         java.util.HashMap<java.lang.String,java.lang.String> conversionMap)
Loads matches using loadFromFile(file,validDocumentIDs,restrictPostfix,conversionMap) and then splits them up to allow access by document ID. Using the resulting HashMap, the list of matches for a specific document can quickly be retrieved.

Parameters:
file -
validDocumentIDs -
conversionMap -
Returns:
a HashMap mapping document IDs to a list of matches for that document.

loadFromFile

public static java.util.List<Mention> loadFromFile(java.io.File file)

loadFromFile

public static java.util.List<Mention> loadFromFile(java.io.File file,
                                                   java.util.Set<java.lang.String> validDocumentIDs,
                                                   java.lang.String restrictPostfix,
                                                   java.util.HashMap<java.lang.String,java.lang.String> conversionMap)
Will load a set of matches from a file, constrained such that only matches from documents in validDocumentIDs are returned. If conversionMap isn't null, matches can also be returned if validDocumentIDs.contains(conversionMap.get(documentID_for_the_match)) is true.

Parameters:
file -
validDocumentIDs -
conversionMap -
Returns:
the set of matches with valid document IDs

saveToFile

public static void saveToFile(java.util.ArrayList<Mention> matches,
                              java.io.File file)
Saves the list of matches to file, sorted by their start and end coordinates.

Parameters:
matches -
file -

compareTo

public int compareTo(Mention o)
Specified by:
compareTo in interface java.lang.Comparable<Mention>

setStart

public void setStart(int start)

setEnd

public void setEnd(int end)

getText

public java.lang.String getText()
Returns:
the text

getComment

public java.lang.String getComment()
Returns:
extra data that can be associated with the object

setComment

public void setComment(java.lang.String comment)
Parameters:
comment - extra data that can be associated with the object

getDocid

public java.lang.String getDocid()
Returns:
the document id

setDocid

public void setDocid(java.lang.String docid)
Parameters:
docid - the document id to set

getProbabilities

public java.lang.Double[] getProbabilities()
Returns:
the probabilities associated with the IDs; these may not have been set, in which case some or all of them may be null

setProbabilities

public void setProbabilities(java.lang.Double[] probabilities)
Parameters:
probabilities - an array of probabilities associated with the IDs; it needs to contain the same number of elements as there are IDs

setText

public void setText(java.lang.String text)
Parameters:
text - the text to set

getMostProbableID

public java.lang.String getMostProbableID()

addToPstmtBatch

public void addToPstmtBatch(java.sql.PreparedStatement pstmt)
Saves the match to a database using PreparedStatements.

Parameters:
pstmt - Statement for inserting the match to a database, with the following fields: 1: entity id, 2: document, 3: start, 4: end, 5: text, 6: comment

setIds

public void setIds(java.lang.String[] ids)
Will also clear the probabilities.

Parameters:
ids - the ids to set

getMostProbableIDWithIdLine

public java.lang.String getMostProbableIDWithIdLine()

getIdsWithLineNumbers

public java.lang.String[] getIdsWithLineNumbers()

getIdsToString

public java.lang.String getIdsToString()

findClosestMention

public static Mention findClosestMention(java.util.List<Mention> mentions,
                                         int pos)

getMentionsInRange

public static java.util.List<Mention> getMentionsInRange(java.util.List<Mention> mentions,
                                                         int start,
                                                         int end)

sortIDsByProbabilities

public void sortIDsByProbabilities()

sort

public static void sort(java.util.List<Mention> mentions)

overlapsIgnoreDoc

public boolean overlapsIgnoreDoc(Mention n)