#! /usr/local/bin/perl -w
# downloadCorpus.pl - Reads in a file containing list of abbreviations
# and their possible expansions. Downloads examples of these
# abbreviations from Medline (via Entrez) and removes the expanded
# form. (Phrases like "Acid Controller (AC)" and "AC (Acid
# Controller)" are substituted with "AC")
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see
# .
use LWP::Simple;
use Getopt::Std;
use Algorithm::ChooseSubsets;
use XML::Simple;
use POSIX;
#####################################################################################################
# GLOBAL VARIABLES and COMMAND LINE OPTIONS
#####################################################################################################
# Used by Entrez
my $utils = "http://www.ncbi.nlm.nih.gov/entrez/eutils";
my $db = "Pubmed";
my $report = "xml";
# Number of documents to retrieve at once
my $batch_size = 250;
# Maximum number of documents to retreive for each sense
my $max_docs = 5000;
# Temporary file
my $tmp_file = "TMP";
# Abstracts that have already been downloaded
%downloaded_abstracts = ();
# Command line options
%options = ();
getopts("hd:", \%options);
if(exists $options{h}) { &usage(); }
# Output directory (detault to corpus/)
if(exists $options{d}) {
$out_dir = $options{d};
} else {
$out_dir = "corpus/";
} # if/else
# Check whether output directory exists, create it if it doesn't
unless(-e $out_dir) {
mkdir($out_dir);
} # unless
#####################################################################################################
# START OF SCRIPT
#####################################################################################################
# Read data from input file
%abbreviations =();
while(<>) {
if(/^([A-Z]+) (M[0-9]+) (\"[^\"]+\")/) {
$abbreviation = $1;
$sense = $2;
$expansion = $3;
$abbreviations{$abbreviation}{$sense} = $expansion;
} # if
} # while
# Go through each of the abbreviations, create queries and retrieve documents
foreach $abbreviation (sort {$a cmp $b} keys %abbreviations) {
# Open output file (delete it if it already exists)
$output_file = $out_dir."/".$abbreviation."_set";
if(-e $output_file) { unlink($output_file); }
open(OUT, ">$output_file") or die "Can't open $output_file for writing\n";
$counter = 1;
# Go through and create the query
foreach $this_sense (sort {$a cmp $b} keys %{$abbreviations{$abbreviation}}) {
$query = "\"".$abbreviation."\"[TIAB] ";
foreach $sense (keys %{$abbreviations{$abbreviation}}) {
unless($sense eq $this_sense) {
$query .= " NOT ";
} # unless
$query .= $abbreviations{$abbreviation}{$sense}."[TIAB] ";
} # foreach
# Retrieve documents
&retrieveDocs($query, $abbreviation, $this_sense, $output_file);
} # foreach
close(OUT);
} # foreach
# Delete the temporary file to be neat
unlink($tmp_file);
#####################################################################################################
# SUBROTINES
#####################################################################################################
sub retrieveDocs {
my ($query, $target_abbreviation, $sense, $output_file) = @_;
# print STDERR "Abbreviation: $target_abbreviation Sense: $sense Query: $query\n";
# Check how many hits there are
my $esearch = "$utils/esearch.fcgi?" .
"tool=downloadCorpus.pl&db=$db&retmax=1&usehistory=y&term=";
my $esearch_result = get($esearch . $query);
my $Count;
my $QueryKey;
my $WebEnv;
unless(defined $esearch_result) {
$Count = 0;
} elsif($esearch_result =~
m|(\d+).*(\d+).*(\S+)|s) {
$Count = $1;
$QueryKey = $2;
$WebEnv = $3;
} else {
$Count = 0;
} # if/else
# Check there are some hits for the query
if($Count == 0) {
return 0;
} # if
# Download documents in batches of $batch_size
$retstart = 0;
$retmax = 0;
while($retmax < $Count && $retmax < $max_docs) {
$retmax = $retstart + $batch_size;
# print STDERR "Retrieving $retstart to $retmax\n";
if($retmax > $Count) {
$retmax = $Count;
} # if
if($retmax > $max_docs) {
$retmax = $max_docs;
} # if
# Print out these abstracts, and relax the query if we need more
my $efetch =
"$utils/efetch.fcgi?" .
"tool=genexamples&email=M.Stevenson\@dcs.shef.ac.uk&" .
"rettype=$report&retmode=text&retstart=$retstart&" .
"retmax=$retmax&" .
"db=$db&query_key=$QueryKey&WebEnv=$WebEnv";
# print STDERR "Asking for results ($retstart to $retmax)\n";
my $efetch_result = get($efetch);
# Get abstracts and update loop variables
&parse_results($efetch_result, $target_abbreviation, $sense, $output_file);
$retstart = $retmax;
} # while
return 1;
} # sub retrieveDocs
# Reads through XML output from Entrez and prints out each abstract
sub parse_results {
my ($search_result, $target_abbreviation, $sense, $output_file) = @_;
my $useful_abstracts = 0;
my $data = XMLin($search_result);
# Go through each article and extract relevant information
# How this is done depends on how many articles there are in the XML
# file (since different data structures are returned for each)
# If hash table is returned means that just one article
if(ref($data->{PubmedArticle}) eq "HASH") {
$pmid = $$data{PubmedArticle}{MedlineCitation}{PMID};
$title = $$data{PubmedArticle}{MedlineCitation}{Article}{ArticleTitle};
if(exists $$data{PubmedArticle}{MedlineCitation}{Article}{Abstract}{AbstractText}) {
$abstract =
$$data{PubmedArticle}{MedlineCitation}{Article}{Abstract}{AbstractText};
} else {
$abstract = "";
} # if/else
@mesh_terms = ();
if(exists $$data{PubmedArticle}{MedlineCitation}{MeshHeadingList}{MeshHeading}) {
$type = ref($$data{PubmedArticle}{MedlineCitation}{MeshHeadingList}{MeshHeading});
# If data type is ARRAY, means there is an array of hashes containing
# MeSH terms (process each one in turn)
if($type eq "ARRAY") {
foreach $mesh_term_hash (@{$$data{PubmedArticle}{MedlineCitation}{MeshHeadingList}{MeshHeading}}) {
if(exists $$mesh_term_hash{DescriptorName}{content}) {
push(@mesh_terms, $$mesh_term_hash{DescriptorName}{content});
} # if
} # foreach
} else {
# Otherwise the data type is hash and there is just one MeSH term to
# process
if(exists $$data{PubmedArticle}{MedlineCitation}{MeshHeadingList}{MeshHeading}{DescriptorName}{content}) {
push(@mesh_terms, $$data{PubmedArticle}{MedlineCitation}{MeshHeadingList}{MeshHeading}{DescriptorName}{content});
} # if
} # if/else
} # if
unless(exists $abstracts_found{$pmid}) {
&printAbstract($output_file, $target_abbreviation, $sense, $pmid, $title, $abstract, @mesh_terms);
} # unless
# If the data structure returned isn't a hash - means there is
# an array of hashes. Process each in turn
} else {
foreach $article (@{$data->{PubmedArticle}}) {
# Gather the information about this abstract
$pmid = $$article{MedlineCitation}{PMID};
$title = $$article{MedlineCitation}{Article}{ArticleTitle};
if(exists $$article{MedlineCitation}{Article}{Abstract}{AbstractText}) {
$abstract =
$$article{MedlineCitation}{Article}{Abstract}{AbstractText};
} else {
$abstract = "";
} # if/else
@mesh_terms = ();
# Need to check how many MeSH terms there are
if(exists $$article{MedlineCitation}{MeshHeadingList}{MeshHeading}) {
$type = ref($$article{MedlineCitation}{MeshHeadingList}{MeshHeading});
# If data type is ARRAY, means there is an array of hashes containing
# MeSH terms (process each one in turn)
if($type eq "ARRAY") {
foreach $mesh_term_hash (@{$$article{MedlineCitation}{MeshHeadingList}{MeshHeading}}) {
if(exists $$mesh_term_hash{DescriptorName}{content}) {
push(@mesh_terms, $$mesh_term_hash{DescriptorName}{content});
} # if
} # foreach
} else {
# Otherwise the data type is hash and there is just one MeSH term to
# process
if(exists $$article{MedlineCitation}{MeshHeadingList}{MeshHeading}{DescriptorName}{content}) {
push(@mesh_terms, $$article{MedlineCitation}{MeshHeadingList}{MeshHeading}{DescriptorName}{content});
} # if
} # if/else
} # if
unless(exists $abstracts_found{$pmid}) {
&printAbstract($output_file, $target_abbreviation, $sense, $pmid, $title, $abstract, @mesh_terms);
} # unless
} # foreach
} # if/else
return $useful_abstracts;
} # sub parse_results
# sub printAbstract
# Purpose:
# Arguments: Three strings (PubMed ID, Title, Abstract)
# Returns: 1 if abstract could be successfully printed and 0 otherwise
sub printAbstract {
my ($output_file, $target_abbreviation, $sense, $pmid, $title, $abstract, @mesh_terms) = @_;
# print STDERR "printAbstract for $pmid\n";
# Don't print this one if it's already been downloaded
if(exists $downloaded_abstracts{$target_abbreviation}{$pmid}) {
return 0;
} # if
# Deal with cases where abstracts don't have sentence breaks at the end
# (by adding one)
if($title =~ /^\[(.+)\]$/) {
$title = $1;
} # if
unless($title =~ /[\.\!\?]$/) {
$title = $title.".";
} # unless
$context = "PMID - $pmid\nTI - $title\nAB - $abstract\n";
# print STDERR "\$context $context\n";
my $expansion = &findExpansion($target_abbreviation, $context);
# print STDERR "\$expansion -- $expansion\n";
if($expansion eq "none_found") {
# print STDERR "\tno expansion found\n";
return 0;
} else {
# Clear out the data structures
$pre_target = "";
$post_target = "";
$local_pre_target = "";
$local_post_target = "";
# Try to find abbreviation and expansion in text. If we can then substitute it with just the
# abbreviation in the text. If we can't find it then return 0.
if($context =~ /(.+?)(($expansion|$target_abbreviation) +\(($expansion|$target_abbreviation)\))(.+)/is) {
$pre_target = $1;
$target = $target_abbreviation;
$post_target = $5;
$substituted_context = $pre_target.$target.$post_target;
} else {
# print STDERR "\tno pattern match\n";
return 0;
} # if/else
# Process abstract
# (1) identify target word
# (2) find its offsets
# (3) identify context around target
# (4) adjust offsets
# (5) print out text in reformatted set
$wide_start_offset = length $pre_target;
$wide_end_offset = $wide_start_offset + length $target;
$wide_end_offset--;
# Look for sentence context around target
# Start with context before target
# Easiest way to do this is to scan from the start of the context
# until the start of the current sentence is found
$local_pre_target = "";
while($pre_target =~ /(.*?([\.\?\!]|$))/gs) {
$local_pre_target .= $1;
# Don't let the sentence end as part of a contraction (e.g.)
# or decimal expression
if($pre_target =~ /(mg\.\Gkg|St\.\G|e\.\Gg\.|i\.\Ge\.|e\.g\.\G|i\.e\.\G|[0-9]\.\G[0-9])/) {
# not the end of the sentence -- keep collecting context
} elsif($pre_target =~ /[\.\?\!]\G/) {
# real end of sentence. Clear sentence and keep going.
$local_pre_target = "";
} # if/else
} # while
# Clean up by trimming any leading whitespace
$local_pre_target =~ s/^[ \n]+//;
# Then look for post target
# Match anything (non-greedily) but don't let last punctuation
# be part of "i.e." or "e.g."
$local_post_target = "";
while($post_target =~ /(.*?[\.\?\!])/gs) {
$local_post_target .= $1;
# Don't end the sentence as part of a contraction (e.g.) or
# decimal expression
if($post_target =~ /(mg\.\Gkg|St\.\G|i\.\Ge\.|e\.\Gg\.|i\.e\.\G|e\.g\.\G|[0-9]\.\G[0-9])/) {
# continue
} else {
last;
} # if/else
} #while
$local_start_offset = length $local_pre_target;
$local_end_offset = $local_start_offset + length $target;
$local_end_offset--;
$local_context = $local_pre_target.$target.$local_post_target;
# Print out context in reformatted version
print OUT "$counter|$pmid.xx.x|$sense\n";
$counter++;
print OUT "$local_context\n";
print OUT "$target_abbreviation|$target|xx|xx|$local_start_offset|$local_end_offset|xxxxx|\n";
print OUT $substituted_context;
print OUT "$target_abbreviation|$target|xx|xx|$wide_start_offset|$wide_end_offset|xxxxx|\n";
# Print out MeSH terms (if there are any)
foreach $mesh_term (@mesh_terms) {
print OUT "MeSH: $mesh_term\n";
} # foreach
print OUT "\n\n";
# Note that abstract has been downloaded
$downloaded_abstracts{$target_abbreviation}{$pmid} = "true";
} # if/else
return 1;
} # sub printAbstract
# sub findExpansion
# Purpose: Calls Hearst's ExtractAbbrev to identify abbreviations and associated expansions in a
# text and returns expansion for the abbreviation we're interested in
# ($target_abbreviation). If it can't find one for the target abbreviation then
# "none_found" is returned. Also, on some occasions there is more than one possible
# expansion for an abbreviation. The number of defined expansions for each abbreviation
# is stored and if there is more than one for the abbreviation we're interested in then
# "none_found" is also returned.
sub findExpansion {
my ($target_abbreviation, $text)= @_;
# print STDERR "Calling findExpansion with \$target_abbreviation \$sense $sense\n";
open(TMP, ">$tmp_file") or die "Can't open temporary file ($tmp_file) for writing\n";
print TMP $text;
close(TMP);
my %extractabbrev_out = ();
my %abbrevambiguity = ();
open(PIPE, "java -classpath /share/nlp/projects/LexDis/tools/ExtractAbbrev/ ExtractAbbrev $tmp_file |")
or die "Can't open pipe for reading\n";
while() {
if(/^([A-Z]+)\s+(.+)/) {
$abbrev = $1;
$expansion = $2;
$extractabbrev_out{$abbrev} = $expansion;
$abbrevambiguity{$abbrev}++;
} # if
} # while
if(exists $extractabbrev_out{$target_abbreviation}) {
if($abbrevambiguity{$target_abbreviation} > 1) {
return "non_found";
} # if
$expansion = $extractabbrev_out{$target_abbreviation};
#print "Returning $expansion\n";
return quotemeta $expansion;
} else {
return "none_found"
} # if/else
} # sub findExpansion
# sub usage
# Prints out usage notes
sub usage {
die("$0 -d dir -h data_file\n\t-d out_dir\tdirectory in which output files are written (default to \"corpus/\")\n\t-h\t\tprints out these usage notes\n");
} # sub usage