#!/usr/bin/perl -w =head1 NAME raw_file_maker.pl -- convert CEL files to raw format =head1 SYNOPSIS B Input CEL files should be in text format. Two output files per input CEL file are created (.raw and .mask files). The output files are put in the same directory as the original CEL file. The script will also optionally draw heatmaps and outlier images in PNG format. Input arguments can be any combination of directories of CEL files or names of CEL files. The CEL files should represent Tag3 or Tag4 data. =cut ############################################################################## =head1 REQUIRES FileHandle, File::Basename, File::Spec, Getopt::Long, Pod::Usage, GD (only when drawing heatmaps), =cut use FileHandle; use File::Basename; use File::Spec; use Getopt::Long; require Pod::Usage; use strict; ############################################################################## ### CONSTANTS my $PROGRAM = $0; $PROGRAM =~ s;^.*/;;; my $VERSION = '$Revision: 36 $'; $VERSION =~ s/^\$Revision: //; $VERSION =~ s/\$$//; $VERSION = 0 unless $VERSION; $VERSION = sprintf "%3.1f", $VERSION/10; my $CHIPINFO = { 70756 => { ct=>'Tag3', stf=>'straintag_Tag3.tab', keyf=>'taginfo_Tag3.tab', min_cv => 100, X=> 266, Y=> 266 }, # Range 0-265 # cols=>[qw/intensity/] }, 101124 => { ct=>'Tag4', stf=>'straintag_Tag4.tab', keyf=>'taginfo_Tag4.tab', min_cv => 0.3, X=> 318, Y=> 318 }, # Range 0-317 # cols=>[qw/copy1 copy2 copy3 copy4 copy5/] }, }; sub default_keydir { my ($base, $dir, $suff) = fileparse($0); my $keydir = File::Spec->catfile($dir, "files"); return (-d $keydir) ? $keydir : undef; } ############################################################################## =head1 DESCRIPTION This script reformats CEL data by mapping X, Y coordinates to strain tags based on information in two files. Outliers are also detected and optional heatmap images can be created. =head2 Input data files The input files should be CEL files from Tag3 or Tag4 chips. The script determines the type of chip based on the number of data points in the file (Tag3 has 70756 points; Tag4 has 101124 points). =head2 Key files The key files are tab-delimited text files used to relate the X, Y coordinates in the CEL file to deletion strain tags. The names of the files are hardcoded in the script. =over 2 =item * B -- Maps X, Y coordinates to oligos. The file should contain the following columns: X, Y, qualifier, oligo, tag_type (optional). The rows should include only the tags of interest; the oligo entry can be in either the sense and antisense direction. The taginfo file should be named "taginfo_Tag3.tab" for Tag3 chips and "taginfo_Tag4.tab" for Tag4 chips. =item * B -- Maps strains to oligos. The file should contain the following columns: strain, uptag_oligo, downtag_oligo (optional). The straintag file should be named "straintag_Tag3.tab" or "straintag_Tag4.tab", where is optional and refers to a pool that can be specified using the --pool option. =back =head2 Output text files Information is grouped by strain tags. In Tag4 chips, the strain tags are defined by orf::batch::(uptag,downtag) and there are 5 values per strain tag. In Tag3 chips, the strain tags are defined by orf::batch:(uptag,downtag):sense and each strain tag has only one value. The .raw file includes an additional column of the average of the strain tag intensities excluding any outliers. =over 2 =item * B<.raw> -- For Tag4, the columns are: strain_tag, copy1, copy2, copy3, copy4, copy5, raw_average. The "raw_average" is the average of the 5 tag copies excluding outliers. For Tag3, the columns are: strain_tag, intensity, raw_average. The "raw_average" is the intensity or undefined if the value is an outlier. =item * B<.mask> -- For Tag4, the columns are: strain_tag, copy1, copy2, copy3, copy4, copy5. For Tag3, the columns are: strain_tag, masked. Entries are 0 or 1, where 1 means the data is an outlier. =head2 Outlier detection Points are marked as outliers when the standard deviation (from the Affy CEL file) divided by the raw intensity is greater than some threshold. In Tag4 chips, the optimal threshold was determined by Sarah to be 0.3; for Tag3 chips, no optimal threshold has been determined, so I arbitrarily set it at 2. Outlier dense regions are also marked in Tag4 chips. These regions are found by comparing the intensities of the 5 tag copies to their median value and flagging tags that are very different from the median. If flagged tags clump together, then all tags in the region are marked as outliers. =cut ############################################################################# =head1 OPTIONS =over 4 =item B<--imgdir> Generate images (PNG format) of heatmaps and masked regions and put the files in the directory specified. (Tag4 chips only) =item B<--od> =item B<--outdir> Save all output text files to this directory. The default is to save the output files in the same directory as the original CEL file. =item B<--keydir> Directory to find key files that describe the tags on the Tag3 and/or Tag4 chip. =item B<--man> Manual page. =back =head1 MORE OPTIONS =over 4 =item B<--patt> Limit input to files matching . =item B<--pool> Limit output to only strains corresponding to this pool. If no strain/tag info is found for the given pool, then the default (all yeast deletion strains) will be used. =item B<--noforce> Do not overwrite any existing files. =item B<--noraw> Do not write .raw or .mask files. =item B<--shiftbg> Subtract background from raw average. =item B<--v> Verbose output. =back =cut sub setup { my $KEYDIR = default_keydir(); unless (@ARGV) { print STDERR "\nDefault keydir = \"$KEYDIR\"\n\n"; Pod::Usage::pod2usage(-verbose => 1); } my $man = 0; my $help = 0; Getopt::Long::Configure('pass_through'); Getopt::Long::GetOptions('help|?' => \$help, man => \$man); Pod::Usage::pod2usage(-verbose => 1, -exitstatus => 0) if $help; Pod::Usage::pod2usage(-verbose => 2, -exitstatus => 0) if $man; my $p = { KEYDIR => undef, IMGDIR => undef, OUTDIR => undef, POOL => undef, PATT => '\w', FORCE => 1, RAW => 1, TAB => 0, DBUG => 0, TAGINFO => {}, MIN_INTENSITY => 100, SHIFTBG => 0, UNUSED => 0, V => 0, CMD => 'REPLACE', }; Getopt::Long::Configure('no_pass_through'); Getopt::Long::GetOptions( 'version' => sub { print "$PROGRAM, version $VERSION\n\n"; Pod::Usage::pod2usage(-verbose => 0, -exitstatus => 0) }, 'keydir=s' => \$$p{KEYDIR}, 'v' => \$$p{V}, 'imgdir=s' => \$$p{IMGDIR}, 'od|outdir=s' => \$$p{OUTDIR}, 'pool=s' => \$$p{POOL}, 'patt=s' => \$$p{PATT}, 'force!' => \$$p{FORCE}, 'raw!' => \$$p{RAW}, 'shiftbg!' => \$$p{SHIFTBG}, 'tab' => \$$p{TAB}, 'dbug' => \$$p{DBUG}, 'cmd=s' => \$$p{CMD}, 'unused' => \$$p{UNUSED}, ) || die "\n"; my $infiles = get_input_files($p, @ARGV); unless ($infiles && %$infiles) { die "No files to process.\n"; } unless ($$p{KEYDIR}) { if ($KEYDIR) { $$p{KEYDIR} = $KEYDIR } else { die "Need keyfile directory to find tag info.\n"; } } if ($$p{IMGDIR}) { require GD; } ($infiles, $p); } sub get_input_files { my $p = shift; my %infiles; my @suf = qw/.cel .CEL/; my $ext = join("|", map { quotemeta($_) } @suf); foreach my $fp (@_) { if (-d $fp) { # dir of CEL opendir(DIR, $fp) || die "Dir $fp: $!"; my @files = grep(/$ext$/, readdir(DIR)); closedir(DIR); foreach my $f (@files) { my $base = basename($f, @suf); $infiles{$base} = File::Spec->catfile($fp, $f); } } elsif (-f $fp && $fp =~ /$ext$/) { # CEL file my $base = basename($fp, @suf); $infiles{$base} = $fp; } elsif (-f $fp && $fp =~ /\.list$|\.txt$/) { my $fh = new FileHandle $fp or die "$fp: $!"; my @lines = grep(!/^#|^filename/ && s/[\r\n]*$//, $fh->getlines); $fh->close; foreach my $f (@lines) { $f =~ s/^\s+//; my @f = split(/ *\t */, $f); next unless $f[0]; my $base = basename($f[0], @suf); $infiles{$base} = $f; } } elsif ($$p{DBUSERAW}) { $infiles{$fp} = 'db'; } } # foreach my $b (sort keys %infiles) { print "$b -> $infiles{$b}\n"; } return %infiles ? \%infiles : undef; } ############################################################################## ### MAIN PROGRAM my ($infile, $p) = setup(); my $numcel = my @celfiles = sort grep(/$$p{PATT}/, keys %$infile); print STDERR "Got $numcel files to process.\n" if $$p{V}; my $num = 0; foreach my $cel (@celfiles) { printf STDERR "%d/%d) Processing $cel\n", ++$num, $numcel; my ($celdata, $cinfo) = get_cel_data($$infile{$cel}, $cel, $p); unless ($cinfo) { print STDERR "$cel -- $celdata"; next }; my $tinfo = get_tag_info($cel, $p, $cinfo); unless ($tinfo) { print STDERR "$cel -- could not get tag info\n"; next }; if ($$cinfo{ct} eq 'Tag4') { @{ $$cinfo{cols} } > 1 ? find_defects_Tag4($celdata, $tinfo, $cinfo, $p) : find_defects($celdata, $tinfo, $cinfo, $p); if ($$p{IMGDIR}) { add_outlines($celdata, 0.5); my $hmf = File::Spec->catfile($$p{IMGDIR}, "$cel-heatmap.png"); create_heatmap($celdata, $hmf, $cinfo, 'heatmap'); my $oif = File::Spec->catfile($$p{IMGDIR}, "$cel-masked.png"); create_heatmap($celdata, $oif, $cinfo, 'outlier image'); } } else { my $ct = $$cinfo{ct}; find_defects($celdata, $tinfo, $cinfo, $p); if ($$p{IMGDIR}) { my $hmf = File::Spec->catfile($$p{IMGDIR}, "$cel-heatmap.png"); create_heatmap($celdata, $hmf, $cinfo, 'rainbow heatmap'); } } if ($$p{RAW}) { my $outf = outfile_name("$cel.raw", $$infile{$cel}, $p); write_raw_data($outf, $celdata, $tinfo, $cinfo, $p, $cel); my $masf = outfile_name("$cel.mask", $$infile{$cel}, $p); write_masked_data($masf, $celdata, $tinfo, $cinfo, $p); } elsif ($$p{SAVEBG}) { write_raw_data(undef, $celdata, $tinfo, $cinfo, $p, $cel); } if ($$p{TAB}) { my $outf = outfile_name("$cel.tab", $$infile{$cel}, $p); write_tab_data($outf, $celdata, $p); } if ($$p{DBUG}) { my $outf = outfile_name("$cel.dbug", $$infile{$cel}, $p); my $covf = outfile_name("$cel-cov.dbug", $$infile{$cel}, $p); write_dbug_data($outf, $covf, $celdata, $tinfo, $cinfo, $p); } } #----------------------------------------------------------------------------- sub get_cel_data { my $celfile = shift; my $exp = shift; my $p = shift; if (-f $celfile) { return parse_celfile($celfile, $p); } else { return "not a file\n"; } } sub parse_celfile { my $file = shift; my $p = shift; # info_init is a string marking the beginning of cel file information # info_end is a string marking the end of cel file information my $info_init = qr/^CellHeader/; my $info_end= qr/\[MASKS|^\s*$/; my $fh = new FileHandle $file or do { my $err = " $file: $!\n"; warn $err; return ($err) }; while (<$fh>) { last if /$info_init/; } my %celdata; while (<$fh>) { last if /$info_end/; s/^ *//; my ($x, $y, $inT, $sd) = split(/ *\t */, $_); my $key = xy_key($x, $y); $inT =~ s/\.0$//; $celdata{$key}{mean_intensity} = $inT; $celdata{$key}{stddev_intensity} = $sd; } $fh->close; my $ndat = scalar(keys %celdata); if ($$CHIPINFO{$ndat}) { printf STDERR " %s chip ($ndat data pts)\n", $$CHIPINFO{$ndat}{ct} if $$p{V}; } else { my $err = " Unknown chip type ($ndat data points)\n"; return ($err); } (\%celdata, $$CHIPINFO{$ndat}); } #----------------------------------------------------------------------------- # $tags{straintag}{tagcopy} = X_Y # Should cover all custom tags, used or unused sub get_tag_info { my $cel = shift; my $p = shift; my $cinfo = shift; my $ct = $$cinfo{ct}; return unless $ct; my $pool = ''; if ($$p{POOL}) { $pool = lc $$p{POOL} } elsif ($$p{HOST}) { my $dbh = get_dbh($p); ($pool) = HIP::DBsubs::get_pool($dbh, $cel); warn " Pool is $pool\n" if $$p{V}; } return $$p{TAGINFO}{$ct}{$pool} if $$p{TAGINFO}{$ct} && $$p{TAGINFO}{$ct}{$pool}; my ($taginfo, $cols); if ($$p{KEYDIR}) { $taginfo = get_tag_info_from_file($$p{KEYDIR}, $cinfo, $pool); } elsif ($$p{HOST}) { my $dbh = get_dbh($p); $taginfo = get_tag_info_from_db($dbh, $cinfo, $pool); } $$p{TAGINFO}{$ct}{$pool} = $taginfo; return $$p{TAGINFO}{$ct}{$pool}; } sub get_tag_info_from_file { my $kdir = shift; my $cinfo = shift; my $pool = shift; my $si = get_straintag_from_file($kdir, $cinfo, $pool); my $oi = get_oligo_info_from_file($kdir, $cinfo, $pool); my %tags; my %copies; foreach my $o (keys %$oi) { foreach my $xy (keys %{ $$oi{$o} }) { my $h = $$oi{$o}{$xy}; my $oligo = $$si{$o} ? $o : revcomp($o); if ($$si{$oligo}) { ($$h{up_dntag}) = keys %{ $$si{$oligo} }; $$h{strain} = $$si{$oligo}{$$h{up_dntag}}{strain}; $$h{zygosity} = $$si{$oligo}{$$h{up_dntag}}{zygosity}; } my ($strainTag, $copy) = strain_tag_and_tag_copy($h); $tags{$strainTag}{$copy} = $xy; $copies{$copy} = 'column'; if ($$h{zygosity}) { $tags{$strainTag}{zygosity} = $$h{zygosity}; $copies{zygosity} = 'extra'; } # print "$xy\t$strainTag\t$copy\t$o\n"; } } my @copies = sort grep($copies{$_} eq 'column', keys %copies); my @extras = sort grep($copies{$_} eq 'extra', keys %copies); $$cinfo{cols} = \@copies; $$cinfo{extras} = \@extras; (\%tags); } sub revcomp { my $oligo = shift; $oligo =~ tr/ATCG/TAGC/; $oligo = reverse $oligo; ($oligo); } sub get_straintag_from_file { my $kdir = shift; my $cinfo = shift; my $pool = shift; my $keyfile = File::Spec->catfile($kdir, $$cinfo{stf}); if ($pool) { my $pkeyfile = $keyfile; $pkeyfile =~ s/\.tab/$pool.tab/; if (-f $pkeyfile) { $keyfile = $pkeyfile } else { warn "Using all deletion strains for pool $pool\n" } } warn " Getting strain data from $keyfile\n" if $$p{V}; my $fh = new FileHandle $keyfile or die "$keyfile: $!"; my @lines = $fh->getlines; foreach (@lines) {s/[\r\n]+$//} $fh->close; my %si; my @fds = split(/ *\t */, shift @lines); foreach (@lines) { my @d = split(/ *\t */, $_); my %h = map { $_ => shift @d } @fds; my $up = $h{uptag_oligo}; my $dn = $h{downtag_oligo}; $si{$up}{uptag} = \%h if $up; $si{$dn}{downtag} = \%h if $dn; } (\%si); } sub get_oligo_info_from_file { my $kdir = shift; my $cinfo = shift; my $pool = shift; my $keyfile = File::Spec->catfile($kdir, $$cinfo{keyf}); if ($pool eq 'expr') { $keyfile =~ s/\.tab/expr.tab/; } warn " Getting $$cinfo{ct} tag data from $keyfile\n" if $$p{V}; my $fh = new FileHandle $keyfile or die "$keyfile: $!"; my @lines = $fh->getlines; foreach (@lines) {s/[\r\n]+$//} $fh->close; my %oi; my @fds = split(/ *\t */, shift @lines); foreach (@lines) { my @d = split(/ *\t */, $_); my %h = map { $_ => shift @d } @fds; my $oligo = $h{oligo}; my $xy = xy_key($h{X},$h{Y}); $oi{$oligo}{$xy} = \%h; } (\%oi); } sub strain_tag_and_tag_copy { my $h = shift; my $sTag = $$h{strain} && $$h{up_dntag} ? sprintf "%s:%s", $$h{strain}, $$h{up_dntag} : "unused-$$h{oligo}"; $sTag = "unusedBad-$$h{oligo}" if $$h{tag_type} && $$h{tag_type}=~/bad$/; my $tagCopy = "intensity"; if ($$h{qualifier} =~ /^(copy\d)/) { $tagCopy = $1 } elsif ($$h{qualifier} =~ /^tag.*_([as])t$/) { my %tt3 = (a=>'antisense',s=>'sense'); $sTag .= sprintf ":%s", $tt3{$1}; } ($sTag, $tagCopy); } sub get_tag_info_from_db { my $dbh = shift; my $cinfo = shift; my $pool = shift; warn " Getting $$cinfo{ct} tag data from database\n" if $$p{V}; my $sth = HIP::DBsubs::tag_info_sth($dbh, $$cinfo{ct}, $pool); my $ps = $pool ? HIP::DBsubs::pool_strain_hash($dbh, $pool, $$cinfo{ct}) : undef; my %tags; my %copies; while (my $h = $sth->fetchrow_hashref) { if ($pool && $$h{strain} && $ps && %$ps) { if (my $p = $$ps{$$h{strain}}) { $$h{zygosity} = $$p{zygosity} if $$p{zygosity}; } else { $$h{strain} = undef; } } my ($tag, $copy) = strain_tag_and_tag_copy($h); $tags{$tag}{$copy} = $$h{X_Y}; $copies{$copy} = 'column'; if ($$h{zygosity}) { $tags{$tag}{zygosity} = $$h{zygosity}; $copies{zygosity} = 'extra'; } #print "$$h{X_Y}\t$tag\t$copy\t$$h{tag_type}\n"; } my @copies = sort grep($copies{$_} eq 'column', keys %copies); my @extras = sort grep($copies{$_} eq 'extra', keys %copies); $$cinfo{cols} = \@copies; $$cinfo{extras} = \@extras; (\%tags); } #----------------------------------------------------------------------------- sub find_defects { my $celdata = shift; my $tinfo = shift; my $cinfo = shift; my $p = shift; warn " Finding defects\n" if $$p{V}; my $numout=0; my @xy = keys %$celdata; foreach my $s (keys %$tinfo) { foreach my $xy (values %{ $$tinfo{$s} }) { next unless defined($$celdata{$xy}) && defined($$celdata{$xy}{mean_intensity}); my $raw = $$celdata{$xy}{mean_intensity}; my $sd = $$celdata{$xy}{stddev_intensity}; my $isout = $raw && $sd/$raw > $$cinfo{min_cv} ? 1 : 0; $$celdata{$xy}{outlier} = $isout; $numout += $isout; } } printf STDERR " %5d outlier points\n", $numout if $$p{V}; } sub find_defects_Tag4 { my $celdata = shift; my $tinfo = shift; my $cinfo = shift; my $p = shift; warn " Finding defects\n"; my @outliers; my $numextout = 0; foreach my $s (keys %$tinfo) { my @reptagint = (); my @reptagref = (); # Get data for the 5 tag copies foreach my $xy (values %{ $$tinfo{$s} }) { next unless defined($$celdata{$xy}) && defined($$celdata{$xy}{mean_intensity}); push(@reptagint, $$celdata{$xy}{mean_intensity}); push(@reptagref, $xy); } # Average tags excluding highest and lowest values my $mn = tossHiLow_mean(@reptagint); # Omit from analysis if tag mean is too low next unless $mn && $mn > $$p{MIN_INTENSITY}; foreach my $xy (@reptagref) { my $raw = $$celdata{$xy}{mean_intensity}; my $sd = $$celdata{$xy}{stddev_intensity}; my $ratio = $raw/$mn; $$celdata{$xy}{raw_ratio} = sprintf "%8.6f", $ratio; my ($isout, $isextout) = is_outlier($raw, $sd, $ratio, $cinfo); push(@outliers, $xy) if $isout; $$celdata{$xy}{outlier} = $isextout; $numextout += $isextout; } } printf STDERR " %5d extreme outlier points\n", $numextout if $$p{V}; # Find outlier dense regions my $outlierdense = medianfilter(\@outliers, 5, 5); my $numod = @$outlierdense; printf STDERR " %5d outlier dense points\n", $numod if $$p{V}; # Include all points in the vicinity of the outlier dense regions my $maskingarea = disk_dilate6($outlierdense); my $nummask = @$maskingarea; printf STDERR " %5d points masked after dilation\n", $nummask if $$p{V}; # Mark all points in vicinity of outlier dense regions as outliers foreach my $xy (@$maskingarea) { $$celdata{$xy}{outlier} = 1; } my $nummasked = grep($$celdata{$_}{outlier}, keys %$celdata); printf STDERR " %5d total points masked\n", $nummasked if $$p{V}; } sub tossHiLow_mean { my @list = sort {$a<=>$b} @_; if (@list > 2) { shift @list; pop @list; } (mean(\@list)); } sub is_outlier { my $raw = shift; my $sd = shift; my $rr = shift; my $cinfo = shift; my $o = ($rr > 1.1 || $rr < 1/1.1) ? 1 : 0; my $eo = $raw && $sd/$raw > $$cinfo{min_cv} ? 1 : 0; ($o, $eo); } # Removes salt and pepper noise sub medianfilter { my $xypts = shift; my $xlen = shift; my $ylen = shift; my $xadj = int($xlen/2); my $yadj = int($ylen/2); my %grid; foreach my $xy (@$xypts) { my ($x, $y) = revkey($xy); my $minx = max([0, $x-$xadj]); my $maxx = min([317, $x+$xadj]); my $miny = max([0, $y-$yadj]); my $maxy = min([317, $y+$yadj]); for(my $ix=$minx; $ix<=$maxx; $ix++) { for(my $iy=$miny; $iy<=$maxy; $iy++) { my $ixy = xy_key($ix, $iy); $grid{$ixy}++; } } } my $mediannum = int(($xlen*$ylen)/2); my @mfxy = grep($grid{$_}>$mediannum, keys %grid); (\@mfxy); } sub disk_dilate6 { my $xypts = shift; my $diam = 6; my $adj = $diam + 2; my %grid; foreach my $xy (@$xypts) { my ($x, $y) = revkey($xy); my $minx = max([0, $x-$adj]); my $maxx = min([317, $x+$adj]); my $miny = max([0, $y-$adj]); my $maxy = min([317, $y+$adj]); for(my $ix=0; $ix<$diam; $ix++) { for(my $iy=0; $iy<$diam; $iy++) { next if ($ix+$iy)>$adj; my (@ix, @iy); push(@ix, $x+$ix) if $x+$ix<=317; push(@ix, $x-$ix) if $ix && $x-$ix>=0; push(@iy, $y+$iy) if $y+$iy<=317; push(@iy, $y-$iy) if $iy && $y-$iy>=0; foreach my $ix (@ix) { foreach my $iy (@iy) { my $ixy = xy_key($ix, $iy); $grid{$ixy} = 1; } } } } } my @dixy = grep($grid{$_}, keys %grid); (\@dixy); } sub add_outlines { my $data = shift; my $val = shift; warn " Outlining masked areas\n" if $$p{V}; my @xypts = grep($$data{$_}{outlier}, keys %$data); my (%x, %y); foreach my $xy (@xypts) { my ($x, $y) = revkey($xy); $x{$x} = 1; $x{$x-1} = 1; $x{$x+1} = 1; $y{$y} = 1; $y{$y-1} = 1; $y{$y+1} = 1; } my %sobel; my @G= ([-2,0,2],[-1,0,1]); foreach my $x (sort grep($_>=0 && $_<318, keys %x)) { foreach my $y (sort grep($_>=0 && $_<318, keys %y)) { my ($sumX, $sumY) = (0,0); #my $str = "xy ($x, $y) Gx: "; for(my $i=-1; $i<=1; $i++) { for(my $j=-1; $j<=1; $j++) { my $xval = $x+$i; next if $xval<0 || $xval > 317; my $yval = $y+$j; next if $yval<0 || $yval > 317; my $sxy = xy_key($xval, $yval); my $v = $$data{$sxy}{outlier} or next; $sumX += $v*$G[abs($i)][$j+1]; $sumY += $v*$G[abs($j)][$i+1]; } } my $xy = xy_key($x, $y); my $v = $$data{$xy}{outlier} || 0; $sobel{$xy} = $v ? 0 : $sumX*$sumX+$sumY*$sumY; } } foreach my $xy (grep($sobel{$_}>3, keys %sobel)) { $$data{$xy}{outline} = $val; } } #----------------------------------------------------------------------------- sub write_raw_data { my $outf = shift; my $celdata = shift; my $tinfo = shift; my $cinfo = shift; my $p = shift; my $cel = shift; my $run = shift; if ($outf && -f $outf && $$p{FORCE}) { unlink($outf) or do { warn "Could not remove $outf: $!\n"; return } } if ($outf && -f $outf) { print STDERR " $outf already exists.\n"; return; } print STDERR " Writing $outf\n" if $outf; my $cols = $$cinfo{cols}; my $exts = $$cinfo{extras}; my $ofh; $ofh = new FileHandle ">$outf" or die ">$outf: $!" if $outf; my @straintags = sort keys %$tinfo; my ($bg, $thresh, $bgsd, $num) = compute_background(\@straintags, $celdata, $tinfo, $cols); unless ($$p{UNUSED}) { @straintags = grep(!/unused/, @straintags); } print $ofh "# Results shifted by background\n" if $$p{SHIFTBG} && $ofh; printf $ofh "# Background = %7.4f +/- %7.4f", $bg, $bgsd if $ofh; if (!$$p{SHIFTBG}) { $bg = 0 } $thresh -= $bg; printf $ofh "\tThreshold = %7.4f\n", $thresh if $ofh; print $ofh join("\t", "strain_tag", @$cols, "raw_average", "above_thresh", @$exts)."\n" if $ofh; my (%saveavg, %savethresh); foreach my $s (@straintags) { my @good = (); my @entries = (); foreach my $k (@$cols) { my $xy = $$tinfo{$s}{$k}; my $v = $$celdata{$xy}{mean_intensity} if $xy; my $m = $xy && $$celdata{$xy}{outlier} ? $$celdata{$xy}{outlier} : 0; push(@entries, defined($v) ? $v : ''); push(@good, $v) if defined($v) && $m < 0.8; } my $avg = @good ? mean(\@good) - $bg : ''; $avg = 0 if $avg && $avg < 0; push(@entries, $avg); my $above_thresh = @good ? ($avg > $thresh ? 1 : 0) : ''; push(@entries, $above_thresh); foreach my $e (@$exts) { push(@entries, $$tinfo{$s}{$e} ? $$tinfo{$s}{$e} : ''); } print $ofh join("\t", $s, @entries)."\n" if $ofh; if (@good) { $saveavg{$s} = $avg; $savethresh{$s} = $above_thresh; } } $$p{AVG_INTEN} = \%saveavg; $$p{ABOVE_THRESH} = \%savethresh; $ofh->close if $ofh; (\%saveavg); } sub compute_background { my $straintags = shift; my $celdata = shift; my $tinfo = shift; my $tcols = shift; my @bg_values; foreach my $unusedTag (grep(/unused-/, @$straintags)) { foreach my $k (@$tcols) { my $xy = $$tinfo{$unusedTag}{$k}; next unless $xy; my $v = $$celdata{$xy}{mean_intensity}; my $m = $$celdata{$xy}{outlier} ? $$celdata{$xy}{outlier} : 0; push(@bg_values, $v) if defined($v) && $m < 0.8; } } my $bg = mean(\@bg_values); my $sd = stddev(\@bg_values); my $thresh = $bg+3*$sd; my @bg_noxhybe = grep($_<$thresh, @bg_values); $bg = mean(\@bg_noxhybe); $sd = stddev(\@bg_noxhybe); $thresh = $bg+2*$sd; my $num = scalar(@bg_noxhybe); printf STDERR " Background = %6.2f Thresh=%6.2f ($num unused tags)\n", $bg, $thresh; ($bg, $thresh, $sd, $num); } sub write_masked_data { my $outf = shift; my $celdata = shift; my $tinfo = shift; my $cinfo = shift; my $p = shift; if (-e $outf && $$p{FORCE}) { unlink($outf) or do { warn "Could not remove $outf: $!\n"; return } } if (-e $outf) { print STDERR " $outf already exists.\n"; return; } print STDERR " Writing $outf\n"; my $ofh = new FileHandle ">$outf" or die ">$outf: $!"; my $cols = $$cinfo{cols}; my $head = join("\t", "strain_tag", @$cols). "\n"; $head =~ s/intensity/masked/; print $ofh $head; my @straintags = sort keys %$tinfo; unless ($$p{UNUSED}) { @straintags = grep(!/unused/, @straintags); } foreach my $s (@straintags) { my @entries = (); foreach my $k (@$cols) { my $xy = $$tinfo{$s}{$k}; my $m = $$celdata{$xy}{outlier} || 0; push(@entries, $m < 0.8 ? 0 : 1); } my $avg = ''; print $ofh join("\t", $s, @entries), "\n"; } $ofh->close; } sub write_dbug_data { my $outf = shift; my $covf = shift; my $celdata = shift; my $tinfo = shift; my $cinfo = shift; my $p = shift; if (-e $outf && $$p{FORCE}) { unlink($outf) or do { warn "Could not remove $outf: $!\n"; return } } if (-e $outf) { print STDERR " $outf already exists.\n"; return; } print STDERR " Writing $outf\n"; my $ofh = new FileHandle ">$outf" or die ">$outf: $!"; print STDERR " Writing $covf\n"; my $cfh = new FileHandle ">$covf" or die ">$covf: $!"; my $cols = $$cinfo{cols}; my $add_avg = (@$cols > 1) ? 1 : 0; print $ofh join("\t", "strain_tag", @$cols); $add_avg ? print $ofh "\traw_average\n" : print $ofh "\n"; print $cfh join("\t", "strain_tag", @$cols), "\n"; my @straintags = sort keys %$tinfo; unless ($$p{UNUSED}) { @straintags = grep(!/unused/, @straintags); } my %hash; foreach my $s (@straintags) { my @good = (); my @cov = (); my @msd = (); my @entries = (); foreach my $k (@$cols) { my $xy = $$tinfo{$s}{$k}; my $v = $$celdata{$xy}{mean_intensity} if $xy; my $sd = $$celdata{$xy}{stddev_intensity} if $xy; my $cv = $v && $sd ? sprintf "%5.3f", $sd/$v : 0; my $m = $xy && $$celdata{$xy}{outlier} ? $$celdata{$xy}{outlier} : 0; push(@entries, defined($v) && $m < 0.8 ? $v : ''); push(@good, $v) if defined($v) && $m < 0.8; push(@cov, defined($v) ? $cv : ''); push(@msd, defined($v) ? $v : '', defined($sd) ? $sd : ''); } print $ofh join("\t", $s, @entries); print $cfh join("\t", $s, @cov, @msd), "\n"; if ($add_avg) { my $avg = ''; if (@good) { $avg = mean(\@good); } print $ofh "\t$avg"; if ($avg && $avg > 100) { for (my $i=0; $i<@cov; $i++) { my $cv = sprintf "%4.2f", $cov[$i]; my $raw = $msd[2*$i]; my $rr = max([$raw/$avg, $avg/$raw]); push(@{ $hash{$cv} }, $rr); } } } print $ofh "\n"; } # foreach my $k (sort keys %hash) { # my $l = $hash{$k}; # printf "%5s: max %6.3f mean %6.3f %4d pts\n", $k, max($l), mean($l), scalar(@$l); # printf STDERR "%5s: max %6.3f mean %6.3f %4d pts\n", $k, max($l), mean($l), scalar(@$l); # } $ofh->close; $cfh->close; } #----------------------------------------------------------------------------- sub create_heatmap { my $data = shift; my $pngf = shift; my $cinfo = shift; my $type = shift; print STDERR " Creating $type $pngf\n"; my $ofh = new FileHandle ">$pngf" or die ">$pngf: $!"; my $im = new GD::Image($$cinfo{X}, $$cinfo{Y}); my $c = allocate_colors($im, $type); my ($max, $min); if ($type eq "rainbow heatmap") { my @v = map { $$data{$_}{mean_intensity} } grep(defined($$data{$_}{mean_intensity}), keys %$data); $min = min(\@v); $max = max(\@v); } for(my $x=0; $x<$$cinfo{X}; $x++) { for(my $y=0; $y<$$cinfo{Y}; $y++) { my $xy = xy_key($x, $y); my $inT = $$data{$xy} ? $$data{$xy} : {}; my $val = color_value($c, $inT, $type, $max, $min); $im->filledRectangle($x, $y, $x+1, $y+1, $val); } } binmode($ofh); print $ofh $im->png; $ofh->close; } sub allocate_colors { my $im = shift; my $type = shift || ''; my %c = ( blk=>$im->colorAllocate(0,0,0) ); if ($type eq 'outlier image') { $c{wht} = $im->colorAllocate(255,255,255); $c{gry} = $im->colorAllocate(128,128,128); } elsif ($type eq 'heatmap') { $c{brt} = $im->colorAllocate(0,255,0); $c{brt1} = $im->colorAllocate(0,204,0); $c{brt2} = $im->colorAllocate(0,153,0); $c{brt3} = $im->colorAllocate(0,102,0); $c{dim} = $im->colorAllocate(255,0,0); $c{dim1} = $im->colorAllocate(204,0,0); $c{dim2} = $im->colorAllocate(153,0,0); $c{dim3} = $im->colorAllocate(102,0,0); $c{out} = $im->colorAllocate(0,0,178); $c{outbrt} = $im->colorAllocate(0,255,178); $c{outbrt1} = $im->colorAllocate(0,204,178); $c{outbrt2} = $im->colorAllocate(0,153,178); $c{outbrt3} = $im->colorAllocate(0,102,178); $c{outdim} = $im->colorAllocate(255,0,178); $c{outdim1} = $im->colorAllocate(204,0,178); $c{outdim2} = $im->colorAllocate(153,0,178); $c{outdim3} = $im->colorAllocate(102,0,178); } elsif ($type eq 'rainbow heatmap') { $c{c9} = $im->colorAllocate(hex2val('444477')); $c{c8} = $im->colorAllocate(hex2val('0000FF')); $c{c7} = $im->colorAllocate(hex2val('004499')); $c{c6} = $im->colorAllocate(hex2val('00FF44')); $c{c5} = $im->colorAllocate(hex2val('66FF22')); $c{c4} = $im->colorAllocate(hex2val('D0FF00')); $c{c3} = $im->colorAllocate(hex2val('FFCC00')); $c{c2} = $im->colorAllocate(hex2val('FE6600')); $c{c1} = $im->colorAllocate(hex2val('FF0000')); $c{blk} = $im->colorAllocate(0,0,0); } (\%c); } sub hex2val { my $hex = shift; my @x = split(//, $hex); my @v; while (@x) { my $h = join('', splice(@x,0,2)); my $v = hex($h); push(@v, $v); } (@v); } sub color_value { my $c = shift; my $h = shift; my $type = shift || ''; my $max = shift || 0; my $min = shift || 0; my $color = ''; if ($type eq 'outlier image') { my $o = $$h{outlier}; my $l = $$h{outline}; $color = 'gry' if $o; $color = 'wht' if $l; } elsif ($type eq 'heatmap') { my $o = $$h{outlier}; my $l = $$h{outline}; my $r = $$h{raw_ratio}; if ($l) { $color = 'out'; } if ($r) { if ($r > 1.2) { $color .= 'brt' } elsif ($r > 1.15) { $color .= 'brt1' } elsif ($r > 1.10) { $color .= 'brt2' } elsif ($r > 1.05) { $color .= 'brt3' } elsif ($r < 1/1.2) { $color .= 'dim' } elsif ($r < 1/1.15) { $color .= 'dim1' } elsif ($r < 1/1.10) { $color .= 'dim2' } elsif ($r < 1/1.05) { $color .= 'dim3' } } } elsif ($type eq 'rainbow heatmap') { my $v = 10*($$h{mean_intensity} - $min)/$max; my $lv = $v > 0 ? log($v) : 0; # print "$$h{mean_intensity}\t$v\t$lv\n"; if ($v && $v > 0.15) { if ($v > 7.25) { $color .= 'c1' } elsif ($v > 5.5) { $color .= 'c2' } elsif ($v > 4) { $color .= 'c3' } elsif ($v > 2.75) { $color .= 'c4' } elsif ($v > 1.75) { $color .= 'c5' } elsif ($v > 1) { $color .= 'c6' } elsif ($v > 0.5) { $color .= 'c7' } elsif ($v > 0.25) { $color .= 'c8' } else { $color .= 'c9' } } } $color = 'blk' unless $color; ($$c{$color}); } #----------------------------------------------------------------------------- sub write_tab_data { my $outf = shift; my $celdata = shift; my $p = shift; print STDERR " Writing $outf\n"; my $ofh = new FileHandle ">$outf" or die ">$outf: $!"; my $xykeys = sort_xy([keys %$celdata]); my @cols = qw/mean_intensity stddev_intensity outlier/; print $ofh join("\t", "X_Y", @cols), "\n"; foreach my $xy (@$xykeys) { my @e = (); foreach my $k (@cols) { push(@e, defined($$celdata{$xy}{$k}) ? $$celdata{$xy}{$k} : ''); } print $ofh join("\t", $xy, @e), "\n"; } $ofh->close; } ##---------------------------------------------------------------------------- sub xy_key { my $x = shift; return unless defined($x); my $y = shift; return unless defined($y); my $xy = sprintf "%d_%d", $x, $y; ($xy); } sub revkey { my $xy = shift; my ($x, $y) = split(/_/, $xy); ($x, $y); } sub sort_xy { my $xy = shift; # return ([ sort @$xy ]); my @xy = sort { my $ka = sprintf "%03d.%03d", revkey($a); my $kb = sprintf "%03d.%03d", revkey($b); $ka<=>$kb; } @$xy; return \@xy; } ##---------------------------------------------------------------------------- sub outfile_name { my $outfile = shift; my $file = shift; my $p = shift; my $path; if ($$p{OUTDIR}) { $path = $$p{OUTDIR}; } else { # Return output file name in same directory as given file. my ($b, $p) = fileparse($file); $path = $p; } return File::Spec->catfile($path, $outfile); } sub print_note { my $fh = shift || *STDERR; my $count = shift; my $opts = shift || {}; my $divbg = $$opts{divbg} || 10000; my $divsm = $$opts{divsm} || 1000; my $char = $$opts{char} || '.'; if (!($count % $divbg)) { print $fh "$count"; } elsif (!($count % $divsm)) { print $fh $char; } } ##---------------------------------------------------------------------------- sub sum { my $l = shift; return unless $l && @$l; return $$l[0] unless @$l > 1; my $sum; foreach(@$l) { $sum+= $_; } return $sum; } sub median { my $l = shift; return unless $l && @$l; return $$l[0] unless @$l > 1; my @l = sort{$a<=>$b}@$l; return $l[$#l/2] if @l&1; my $mid= @l/2; return ($l[$mid-1]+$l[$mid])/2; } sub mean { my $l = shift; return unless $l && @$l; return $$l[0] unless @$l > 1; return sum($l)/scalar(@$l); } sub stddev { my $list = shift; return unless ref($list) eq 'ARRAY' && @$list; return 0 if @$list == 1; my $m = mean($list); my $nume = 0; foreach (@$list) { $nume += ($_-$m)**2; } return sqrt($nume/$#$list); } sub min { my $l = shift; return unless $l && @$l; return $$l[0] unless @$l > 1; my ($mx) = sort {$a<=>$b} @$l; ($mx); } sub max { my $l = shift; return unless $l && @$l; return $$l[0] unless @$l > 1; my ($mx) = reverse sort {$a<=>$b} @$l; ($mx); } ##---------------------------------------------------------------------------- =head1 COPYRIGHT Copyright 2007 The Board of Trustees of the Leland Stanford Junior University. All Rights Reserved. =cut