uk.ac.man.entitytagger.evaluate
Class Evaluate

java.lang.Object
  extended by uk.ac.man.entitytagger.evaluate.Evaluate

public class Evaluate
extends java.lang.Object


Nested Class Summary
(package private)  class Evaluate.Tag
          Simple data holder class for a single mention
 
Constructor Summary
Evaluate()
           
 
Method Summary
private  java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.util.List<Evaluate.Tag>>> convert(java.util.Map<java.lang.String,java.util.List<Mention>> hash)
           
private static void filterTagsByDocLength(java.util.Map<java.lang.String,java.util.List<Mention>> mainTaggedSet, java.util.Map<java.lang.String,java.lang.Integer> docLengthFilters)
           
static void filterTagsByRegexp(java.util.Map<java.lang.String,java.util.List<Mention>> mainTaggedSet, java.util.Map<java.lang.String,java.util.regex.Pattern> filters)
           
(package private) static java.util.Set<java.lang.String> getDocumentSelection(java.lang.String[] documents, int n)
          Randomly returns n document ids from an array of document ids.
private static java.util.Set<java.lang.String> getValidDocs(java.util.Set<java.lang.String> mainSet, java.util.Set<java.lang.String> mainTaggedSet, java.util.Set<java.lang.String> refSet, java.util.Set<java.lang.String> refTaggedSet, java.util.HashMap<java.lang.String,java.lang.String> articleConversions)
          Function which, given document id sets, will return a set of IDs that are relevant for evaluation.
private static java.util.HashMap<java.lang.String,java.lang.Integer> loadDocLengthFilters(DocumentIterator documents)
           
private static java.util.Set<java.lang.String> loadDocumentIDSet(java.io.File file)
           
private static java.util.HashMap<java.lang.String,java.lang.String> loadIndexfile(java.io.File file)
          Loads an index file correlating two sets of document ids with each other, so that they can be mapped to each other (becoming equivalent during the evaluation)
private static java.util.Set<java.lang.String> loadValidEntities(java.io.File file, java.lang.String prefix)
          Loads a list of entity ids that should be processed during evaluation (anything else will be ignored)
static void main(java.lang.String[] args)
           
private static void printEffectiveStats(java.lang.String title, java.util.Map<java.lang.String,java.util.List<Mention>> mainTaggedSet, java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.util.List<Evaluate.Tag>>> mainTagsBySpecies, java.util.Set<java.lang.String> doclist_a, java.util.Set<java.lang.String> doclist_b, java.util.Map<java.lang.String,java.lang.String> conversionMap)
          Function which will print a few statistics to System.out
(package private)  Result[] process(java.util.Map<java.lang.String,java.util.List<Mention>> mainTags, java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.util.List<Evaluate.Tag>>> mainTagsByDoc, java.util.Map<java.lang.String,java.util.List<Mention>> refTags, java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.util.List<Evaluate.Tag>>> refTagsByDoc, java.util.Map<java.lang.String,java.lang.String> articleConversionMap, java.util.Set<java.lang.String> validEntities, java.io.File logFile, boolean print, java.util.Set<java.lang.String> validDocIDs, java.lang.String title)
          The main evaluation processing method
private static void reduceIDs(java.util.Map<java.lang.String,java.util.List<Mention>> mainTaggedSet)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Constructor Detail

Evaluate

public Evaluate()
Method Detail

process

Result[] process(java.util.Map<java.lang.String,java.util.List<Mention>> mainTags,
                 java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.util.List<Evaluate.Tag>>> mainTagsByDoc,
                 java.util.Map<java.lang.String,java.util.List<Mention>> refTags,
                 java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.util.List<Evaluate.Tag>>> refTagsByDoc,
                 java.util.Map<java.lang.String,java.lang.String> articleConversionMap,
                 java.util.Set<java.lang.String> validEntities,
                 java.io.File logFile,
                 boolean print,
                 java.util.Set<java.lang.String> validDocIDs,
                 java.lang.String title)
The main evaluation processing method

Parameters:
mainTags - main tag set
mainTagsByDoc - main tag set, separated by document
refTags - reference (gold-standard) tag set
refTagsByDoc - reference (gold-standard) tag set, separated by document
articleConversionMap - mapping between equivalent article IDs (e.g. PMID <-> PMCID)
validEntities - a set of all entities that are valid (anything not in this set is ignored)
logFile - the file where a list of TPs, FPs and FNs should be listed (may be null)
print - if true, will print evaluation results to System.out
validDocIDs - similar to validEntities, a set of document IDs that are valid (mentions from any documents not in this set will be ignored)
title - job title (will be printed before results)
Returns:
Result[]{document-level-result, mention-level-result}

convert

private java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.util.List<Evaluate.Tag>>> convert(java.util.Map<java.lang.String,java.util.List<Mention>> hash)

loadIndexfile

private static java.util.HashMap<java.lang.String,java.lang.String> loadIndexfile(java.io.File file)
Loads an index file correlating two sets of document ids with each other, so that they can be mapped to each other (becoming equivalent during the evaluation)

Parameters:
file - the index file to load
Returns:
a map with mappings between document ids

loadDocumentIDSet

private static java.util.Set<java.lang.String> loadDocumentIDSet(java.io.File file)

loadValidEntities

private static java.util.Set<java.lang.String> loadValidEntities(java.io.File file,
                                                                 java.lang.String prefix)
Loads a list of entity ids that should be processed during evaluation (anything else will be ignored)

Parameters:
file - the file containing the list of entity ids to load
prefix - prefix which will be added to the beginning of each ID (may be null)
Returns:
the set of entity ids loaded from file

getDocumentSelection

static java.util.Set<java.lang.String> getDocumentSelection(java.lang.String[] documents,
                                                            int n)
Randomly returns n document ids from an array of document ids. This function is used for re-sampling statistical studies of accuracy robustness.

Parameters:
documents - an array of document ids
n - the number of document ids to select
Returns:
a set of n ids from documents, randomly selected

main

public static void main(java.lang.String[] args)
Parameters:
args -

reduceIDs

private static void reduceIDs(java.util.Map<java.lang.String,java.util.List<Mention>> mainTaggedSet)

filterTagsByDocLength

private static void filterTagsByDocLength(java.util.Map<java.lang.String,java.util.List<Mention>> mainTaggedSet,
                                          java.util.Map<java.lang.String,java.lang.Integer> docLengthFilters)

loadDocLengthFilters

private static java.util.HashMap<java.lang.String,java.lang.Integer> loadDocLengthFilters(DocumentIterator documents)

filterTagsByRegexp

public static void filterTagsByRegexp(java.util.Map<java.lang.String,java.util.List<Mention>> mainTaggedSet,
                                      java.util.Map<java.lang.String,java.util.regex.Pattern> filters)
Parameters:
mainTaggedSet -
filters -

printEffectiveStats

private static void printEffectiveStats(java.lang.String title,
                                        java.util.Map<java.lang.String,java.util.List<Mention>> mainTaggedSet,
                                        java.util.Map<java.lang.String,java.util.Map<java.lang.String,java.util.List<Evaluate.Tag>>> mainTagsBySpecies,
                                        java.util.Set<java.lang.String> doclist_a,
                                        java.util.Set<java.lang.String> doclist_b,
                                        java.util.Map<java.lang.String,java.lang.String> conversionMap)
Function which will print a few statistics to System.out

Parameters:
title -
mainTaggedSet -
mainTagsBySpecies -
doclist_a -
doclist_b -
conversionMap -

getValidDocs

private static java.util.Set<java.lang.String> getValidDocs(java.util.Set<java.lang.String> mainSet,
                                                            java.util.Set<java.lang.String> mainTaggedSet,
                                                            java.util.Set<java.lang.String> refSet,
                                                            java.util.Set<java.lang.String> refTaggedSet,
                                                            java.util.HashMap<java.lang.String,java.lang.String> articleConversions)
Function which, given document id sets, will return a set of IDs that are relevant for evaluation.

Parameters:
mainSet - the set of document IDs in our main set that could _potentially_ have been tagged (some may not have any tags due to not containing any entities).
mainTaggedSet - the set of document IDs in our main set that have been tagged as containing entities
refSet - the set of document IDs in our reference set that could _potentially_ have been tagged (some may not have any tags due to not containing any entities).
refTaggedSet - the set of document IDs in our reference set that have been tagged as containing entities
articleConversions - String <-> String conversion map, for mapping e.g. PMIDs to PMCIDs.
Returns:
the set of document ids: (mainTaggedSet * refSet) + (refTaggedSet * mainSet).