#!/usr/bin/perl -w =head1 NAME fitness_profile.pl -- compare treatment to control data =head1 SYNOPSIS B The arguments are the names of directories of data files representing the controls and treatment data, respectively. The data files should be the .norm files generated by C. B --ctrlset Alternatively, use the --ctrlset option and specify a file representing the control set data. =cut ############################################################################# =head1 REQUIRES FileHandle, File::Basename, File::Spec, Getopt::Long, Pod::Usage, Statistics::Distributions (only when calculating p-values), GD (only when drawing images), =cut use FileHandle; use File::Basename; use File::Spec; use Getopt::Long; require Pod::Usage; use strict; ############################################################################# ### CONSTANTS my $PROGRAM = $0; $PROGRAM =~ s;^.*/;;; my $VERSION = '$Revision: 38 $'; $VERSION =~ s/^\$Revision: //; $VERSION =~ s/\$$//; $VERSION = 0 unless $VERSION; $VERSION = sprintf "%3.1f", $VERSION/10; my $FONT = find_font("/usr/X11R6/lib/X11/fonts/TTF/luxisr.ttf"); sub find_font { my $default = shift; return $default if -f $default; foreach my $path (qw/C:\WINDOWS\Fonts C:\WINNT\Fonts/) { if (-d $path) { my $ft = File::Spec->catfile($path, "ARIAL.TTF"); return $ft if -f $ft; } } } # analysis types my $CC = "Ratio"; my $ZS = "Z-score"; my %DB_COLS = ( norm_quantile=> { avg_normq=> ['norm_quantile', $CC, 'avg'], stddev_normq=>['norm_quantile', $CC, 'stddev'], avg_lognq=> ['norm_quantile', $ZS, 'avg'], stddev_lognq=>['norm_quantile', $ZS, 'stddev'], avg => { $ZS=> 'avg_lognq', $CC=> 'avg_normq' }, stddev=>{ $ZS=> 'stddev_lognq', $CC=> 'stddev_normq' }, CV => { $ZS=> 'CV normq (log)', $CC=> 'CV normq' }, GOOD_TAG=> { $ZS=>'GOOD_TAG normq (log)', $CC=>'GOOD_TAG normq' }, }, norm_mean=> { avg_normm=> ['norm_mean', $CC, 'avg'], stddev_normm=>['norm_mean', $CC, 'stddev'], avg_lognm=> ['norm_mean', $ZS, 'avg'], stddev_lognm=>['norm_mean', $ZS, 'stddev'], avg => { $ZS=> 'avg_lognm', $CC=> 'avg_normm' }, stddev=>{ $ZS=> 'stddev_lognm', $CC=> 'stddev_normm' }, CV => { $ZS=> 'CV normm (log)', $CC=> 'CV normm' }, GOOD_TAG=> { $ZS=>'GOOD_TAG normm (log)', $CC=>'GOOD_TAG normm' }, } ); # float zero defined as abs(value)< $ZERO my $ZERO = 0.0001; ############################################################################# =head1 DESCRIPTION This script compares a set of treatment data to control data and assigns fitness defect scores to each deletion strain represented. Two methods for calculating a strain's fitness are implemented: log ratios and z-scores. Log ratios are calculated for each deletion strain tag as log2(avg(i_c)/avg(i_t)) where avg(i_c) is the average normalized intensity of the tag across the control data and avg(i_t) is the average normalized intensity of the tag across the treatment data. Z-scores are calculated for each deletion strain tag as (avg(l_c)-avg(l_t))/stddev(l_c) where avg(l_c) is the average of the log2 normalized intensities of the tag across the control data, avg(l_t) is the average of the log2 normalized intensities of the tag across the treatment data, and stddev(l_c) is the standard deviation of the log2 normalized intensities for the tag in the control data. Z-scores can be computed for a tag if the stddev(l_c) is not zero. Each deletion strain may have up to two tags: an uptag and a downtag. The final fitness defect score for each strain is the average of the uptag and downtag log ratios or z-scores if both are present. =head2 Output files The default output file is a tab-delimited file containing the z scores where the rows are deletion strains/genes and the columns are individual experiments. =cut ############################################################################# =head1 OPTIONS =over 4 =item B<--ratio> Calculate ratios. The ratio calculated for each tag is log2(ctrl/treat). =item B<--z> Calculate z-scores. =item B<--pvalue> Also calculate p-values and q-values for z-scores. =item B<--keyfile> Strain info file containing strain, gene, essential gene, feature type and GO information. =item B<--imgdir> Create images of analysis results and save in directory . =item B<--of> =item B<--outfile> Save results to output file. (Default: results.txt) =back =head1 MORE OPTIONS =over 4 =item B<--sing> Analyze treatment experiments singly instead of averaging them together. =item B<--thresh> Fraction of control tags above threshold to be considered a good tag. (Default: 0.75) =item B<--minint> Minimum intensity value of a control tag. Any control tag with a lower average value will be omitted from the analysis. (Default: 0) =item B<--maxcv> Maximum value of the coefficent of variation (CV) for a control tag, where CV = stddev/mean. Any control tag with a larger CV will be omitted from the analysis. (Default: 100) =item B<--rank> Include ranking of results in output. =item B<--nonm> =item B<--nonormmean> Do not run analyses using norm_mean values. =item B<--nonq> =item B<--nonormquan> Do not run analyses using norm_quantile values. =item B<--ctrlset> Specify a control set file in place of a control directory or list of files. =item B<--savectrlset> Save control set data to . =item B<--savevals> Print output files of intermediate values. =item B<--alltags> Also print results for individual tags (uptag, downtag...). =item B<--fudge> Add 90th percentile fudge factor to Z-scores. =back =head1 RATIO OPTIONS Modify the basic ratio: log2((ctrl-bg)/(treat-bg))/e^(cfx*(ctrl-bg)). =over 4 =item B<--bg> Assumed background value. This value will be subtracted from all tag intensities before calculating z-scores or ratios. (Default: 0) =item B<--cfx> Correction factor. Used when calculating ratios to correct for saturation effects that cause the calculated ratio to be smaller than the true ratio as intensities increase. (Default: 0) =back =cut sub setup { Pod::Usage::pod2usage(-verbose => 1) unless @ARGV; my $man = 0; my $help = 0; Getopt::Long::Configure('pass_through'); Getopt::Long::GetOptions('help|?' => \$help, man => \$man); Pod::Usage::pod2usage(-verbose => 1, -exitstatus => 0) if $help; Pod::Usage::pod2usage(-verbose => 2, -exitstatus => 0) if $man; my $p = { ZS => 0, PVAL => 0, RATIO => 0, SING => 0, KEYFILE => default_keyfile(), IMGDIR => undef, LABEL => '', SING => 0, THRESH => 0.75, MININT => 0, MAXCV => 100, ADD_RANK => 0, ADD_SIG => 0, NMEAN => 1, NQUAN => 1, CS => 0, CSSAVE => undef, VSAVE => undef, DBUG => undef, ALLTAGS => 0, RESULTTAGS => 1, FUDGE => 0, BG => 0, CFX => 0, OUTFILE => undef, ANALYSES => [], FIELDS => [], SUFF => '.norm', MINCTRLS => 2, FONT => $FONT, C_NUM => 0, T_NUM => 0, C_EXPS => undef, T_EXPS => undef, }; Getopt::Long::Configure('no_pass_through'); Getopt::Long::GetOptions( 'version' => sub { print "$PROGRAM, version $VERSION\n\n"; Pod::Usage::pod2usage(-verbose => 0, -exitstatus => 0) }, 'z' => \$$p{ZS}, 'pvalue' => \$$p{PVAL}, 'ratio' => \$$p{RATIO}, 'keyfile=s' => \$$p{KEYFILE}, 'imgdir=s' => \$$p{IMGDIR}, 'of|outfile=s' => \$$p{OUTFILE}, 'sing' => \$$p{SING}, 'thresh=f' => \$$p{THRESH}, 'minint=i' => \$$p{MININT}, 'maxcv=f' => \$$p{MAXCV}, 'rank' => \$$p{ADD_RANK}, 'fudge' => \$$p{FUDGE}, 'nm|normmean!' => \$$p{NMEAN}, 'nq|normquan!' => \$$p{NQUAN}, 'ctrlset' => \$$p{CS}, 'savectrlset=s' => \$$p{CSSAVE}, 'savevals=s' => \$$p{VSAVE}, 'dbug' => \$$p{DBUG}, 'alltags' => \$$p{ALLTAGS}, 'result!' => \$$p{RESULTTAGS}, 'bg=i' => \$$p{BG}, 'cfx=f' => \$$p{CFX}, ) || die "\n"; @ARGV || die "Need data to process.\n"; my $c_in = shift @ARGV; my @e_in = @ARGV; if ($$p{PVAL}) { require Statistics::Distributions; } push(@{ $$p{FIELDS} }, 'norm_mean') if $$p{NMEAN}; unshift(@{ $$p{FIELDS} }, 'norm_quantile') if $$p{NQUAN} || !$$p{NMEAN}; push(@{ $$p{ANALYSES} }, $ZS) if $$p{ZS}; push(@{ $$p{ANALYSES} }, $CC) if $$p{RATIO}; @{ $$p{ANALYSES} } ? print STDERR "\nAnalyses: ".join(", ",@{ $$p{ANALYSES} })."\n" : die "Need to specify analysis type: --z and/or --ratio\n"; if ($$p{IMGDIR}) { require GD; warn "Image directory: $$p{IMGDIR}\n"; } my %input; if ($$p{CS}) { warn "Input controls: $c_in\n"; if ($$p{CS}) { -f $c_in or die "$c_in file not readable\n" } $input{control} = { INPUT=>$c_in }; } else { warn "Reading input controls: $c_in\n"; $$p{C_NUM} = read_input($c_in, 'control', $$p{SUFF}, \%input); } unless ($$p{OUTFILE}) { $$p{OUTFILE} = 'results.txt' ; warn "Results will be written to $$p{OUTFILE}\n"; } %input || die "No control data.\n"; for(my $i=0; $i<@e_in; $i++) { my $e = $e_in[$i]; my $label = @e_in>1 ? sprintf "treat%02d", $i+1 : "treat"; $$p{T_NUM} += read_input($e, $label, $$p{SUFF}, \%input, $p, $$p{SING}); } (\%input, $p); } sub read_input { my $arg = shift; my $label = shift; my $suf = shift; my $hash = shift || {}; my $p = shift || {}; my $sing = shift || 0; warn "Reading input for $arg\n" if $$p{DBUG}; my %suf = map {$_=>1} ($suf, lc $suf, uc $suf); my @suf = keys %suf; my $ext = join('$|', map { quotemeta($_) } @suf) . '$'; my @files; my $ulabel; if (-d $arg) { opendir(DIR, $arg) or die "Dir $arg: $!"; @files = map { File::Spec->catfile($arg, $_) } grep(/$ext/, readdir(DIR)); closedir(DIR); $arg =~ s:[\/\\]::; my @d = File::Spec->splitdir($arg); $ulabel = pop(@d); } elsif (-f $arg && $arg =~ /$ext/) { push(@files, $arg); $ulabel = basename($arg, @suf); } elsif (-f $arg && $arg =~ /list$|txt$/) { my $fh = new FileHandle $arg or die "$arg: $!"; @files = grep(!/^#|^filename/ && s/[\r\n]*$//, $fh->getlines); $fh->close; $ulabel = basename($arg, qw/.list .txt/); } else { warn "$arg is not a directory or recognized file type\n"; } if (@files) { my $patt = quotemeta($ext); my $num = 0; foreach my $f (@files) { my @f = split(/ *\t */, $f); next unless @f; my $base = basename($f[0], @suf); if ($sing) { my $set = sprintf "%s%02d", $label, ++$num; my $set2 = basename($f, @suf); $$hash{$set}{EXPS}{$base}{SOURCE} = $f; $$hash{$set}{EXPS}{$base}{NAME} = join("::", @f); $$hash{$set}{LABEL} = $set2; } else { $$hash{$label}{EXPS}{$base}{SOURCE} = $f; $$hash{$label}{EXPS}{$base}{NAME} = join("::", @f); } } $$hash{$label}{LABEL} = $ulabel unless $sing; } else { warn "No entries found in $arg\n"; } (scalar(@files)); } sub default_keyfile { my ($base, $path, $suff) = fileparse($0); my $kfile = File::Spec->catfile($path,"files", "straininfo.tab"); return (-f $kfile) ? $kfile: undef; } ################################################################################ ### MAIN PROGRAM my ($input, $p) = setup(); my $cLAB = 'control'; ($$p{SI}, $$p{FDS}) = get_strain_info($p); my @k = keys %$input; get_input_data($p, $input) if $$p{C_NUM} || $$p{T_NUM}; my $cdat = compile_ctrl_data($p, $input); my $tdat = compile_treat_data($p, $input); print_data_values($input, $cdat, $tdat, $p) if defined($$p{VSAVE}); my $td = tag_analysis($cdat, $tdat, $p); my ($res, $strains, $cols) = average_strain_tags($td, $input, $p); print_results($res, $strains, $cols, $input, $$p{OUTFILE}, $p) if $$p{OUTFILE}; if ($$p{IMGDIR}) { foreach my $col (sort keys %{$$p{IMGC}}) { my $sgc = $$p{IMGC}{$col}; my ($type, $label) = split(/::/, $col); my $pngf = sprintf "%s-%s.png", $label, $type; my $imgf = File::Spec->catfile($$p{IMGDIR}, $pngf); create_image($res, $col, $sgc, $type, $$p{SI}, $imgf); } } ################################################################################ ## Subroutines sub get_input_data { my $p = shift; my $input = shift; warn "\nGetting data\n"; my %expdata; ## Get all exp data at once. If exps are in both the controls and the ## treats, then they only have to be imported once. foreach my $setname (sort keys %$input) { my $fhash = $$input{$setname}{EXPS}; my $label = $$input{$setname}{LABEL}; $label = $label ? "$setname: $label" : $setname; foreach my $exp (sort keys %$fhash) { my ($data, @x); my $source = $$fhash{$exp}{SOURCE}; if ($expdata{$source} && $expdata{$source}{DATA}) { $$fhash{$exp}{DATA} = $expdata{$source}{DATA}; print STDERR " $exp -- already got data\n"; next; } elsif (-f $source) { $data = get_data_tabfile($source, $p); @x = ('', $source); } else { print STDERR " $exp -- could not retrieve data\n"; next; } if ($data && %$data) { $$fhash{$exp}{DATA} = $expdata{$source}{DATA} = $data; printf STDERR " %s%s -- %d data pts (%s)\n", $exp, $x[0], scalar(keys %$data), $label; } else { print STDERR " $exp -- no data in $x[1] ($label).\n"; } } } (\%expdata); } sub get_data_tabfile { my $file = shift; my $p = shift; my $fields = $$p{FIELDS}; my $fh = new FileHandle $file or die "$file: $!"; my @lines = grep(!/^#/ && /\w/ && s/[\r\n]*$//, $fh->getlines); $fh->close; my $head = shift @lines; my @head = split(/ *\t */, $head); my @cols = ('strain_tag', 'above_thresh', @$fields); my $idx = find_fields(\@head, \@cols, $file); return undef unless scalar(keys %$idx) > 1; my %data = (); foreach (@lines) { my @data = split(/ *\t */, $_); my $st = $data[$$idx{strain_tag}]; next unless defined($st) && $st !~ /unused/; foreach my $datf ("above_thresh", @$fields) { my $v = $data[$$idx{$datf}]; next unless defined($v); $data{$st}{$datf} = $v; } } (\%data); } #----------------------------------------------------------------------------# sub compile_ctrl_data { my $p = shift; my $input = shift; my $chash = $$input{control}; my $cdat; if (my $c_src = $$chash{INPUT}) { if ($$p{CS} && -f $c_src) { $cdat = parse_ctrlset_data_file($c_src, $p); } else { die "Don't know how to get control set data from $c_src\n"; } } else { my $cexps = $$p{C_EXPS} = [ sort grep(defined($$chash{EXPS}{$_}{DATA}), keys %{ $$chash{EXPS} }) ]; $cdat = get_mean_of_exps($chash, $cexps, $cLAB, $p, 1); } if ($$p{CSSAVE}) { save_ctrlset_file($cdat, $$p{CSSAVE}, $p); } %$cdat || die "No control data.\n"; ($cdat); } sub parse_ctrlset_data_file { my $cs_file = shift; my $p = shift; print STDERR "\nGetting control set data from \"$cs_file\"\n"; my $fh = new FileHandle $cs_file or die "$cs_file: $!"; my @lines = grep(/\w/ && s/[\r\n]*$//, $fh->getlines); $fh->close; my $head = shift @lines; if ($head =~ s/^#Control exps:\s*//) { my @cexps = sort split(/[\s,]+/, $head); $$p{C_EXPS} = \@cexps; $head = shift @lines; } my @h = split(/ *\t */, $head); my $cdat = reformat_ctrl_data(\@lines, \@h, $p); ($cdat); } sub save_ctrlset_file { my $cdat = shift; my $outfile = shift; my $p = shift; my @t = @{ $$p{ANALYSES} }; my @f = qw/avg stddev/; print STDERR "Writing control set data to $outfile\n\n"; my $ofh = new FileHandle ">$outfile" or die ">$outfile: $!"; my @head = ('strain_tag', 'above_thresh'); foreach my $datf (@{ $$p{FIELDS} }) { my $col = $DB_COLS{$datf}; foreach my $type (@t) { foreach my $f (@f) { push(@head, $$col{$f}{$type}) } } } my $cexps = ref($$p{C_EXPS}) eq 'ARRAY' ? $$p{C_EXPS} : []; print $ofh "#Control exps: ". join(", ", @$cexps)."\n"; print $ofh join("\t", @head)."\n"; foreach my $st (sort keys %$cdat) { my $at = defined($$cdat{$st}{above_thresh}) ? $$cdat{$st}{above_thresh} : ''; my @row = ($st, $at); foreach my $datf (@{ $$p{FIELDS} }) { foreach my $type (@t) { foreach my $f (@f) { my $v = $$cdat{$st}{$datf}{$type}{$f}; push(@row, defined($v) ? $v : ''); } } } print $ofh join("\t", @row)."\n"; } $ofh->close; } sub reformat_ctrl_data { my $list = shift; my $cols = shift; my $p = shift; shift @$cols; #strain_tag shift @$cols; #above_thresh my @f; foreach my $datf (@{ $$p{FIELDS} }) { my $hcol = $DB_COLS{$datf}; for(my $i=0; $i<@$cols; $i++) { if (my $f = $$hcol{$$cols[$i]}) { $f[$i] = $f; } } } unless (grep(defined($_), @f)) { my $datf = join(", ", @{ $$p{FIELDS} }); die " No $datf data retrieved.\n"; } my %cdat; foreach my $l (grep(!/^#/ && /\w/, @$list)) { my ($st, $at, @v) = ref($l) eq 'ARRAY' ? @$l : split(/ *\t */, $l); my %h; my $tag_good = $at >= $$p{THRESH} ? 1 : 0; foreach my $f (@f) { my $v = shift @v; next unless $f && defined($v); $h{$$f[0]}{$$f[1]}{$$f[2]} = $v; } foreach my $f (keys %h) { foreach my $t (keys %{$h{$f}}) { my $hft = $h{$f}{$t}; my $minint = $t eq $ZS && $$p{MININT} ? log2($$p{MININT}) : $$p{MININT}; my ($cv, $gt) = cv_goodtag($$hft{avg}, $$hft{stddev}, $minint, $$p{MAXCV}); if (defined($cv)) { $$hft{CV} = $cv; $$hft{GOOD_TAG} = $gt && $tag_good; $h{above_thresh} = $at; $cdat{$st} = \%h; } } } } (\%cdat); } sub get_mean_of_exps { my $hash = shift; my $exps = shift; my $label = shift; my $p = shift; my $isCtrl = shift; unless (ref($exps) eq 'ARRAY' && @$exps) { warn "No data for $label\n" } $isCtrl ? warn "Getting mean and stddev -- $label\n" : printf STDERR "%s -- $label\n", @$exps==1 ? "Transforming data" : "Getting mean"; my %alldat; my $log2 = log(2); foreach my $exp (@$exps) { my $data = $$hash{EXPS}{$exp}{DATA}; foreach my $st (keys %$data) { my @f = keys %{ $$data{$st} }; foreach my $datf (grep(!/strain_tag|above_thresh/, @f)) { my $v = $$data{$st}{$datf}; if (defined($v)) { push(@{ $alldat{$st}{$datf}{$CC} }, $v) if $$p{RATIO}; push(@{ $alldat{$st}{$datf}{$ZS} }, log($v)/$log2) if $v && $$p{ZS}; } } if ($isCtrl) { my $v = $$data{$st}{above_thresh}; push(@{ $alldat{$st}{TAG} }, $v) if defined($v); } } } my %msdat; foreach my $st (keys %alldat) { my ($tag_good, $at); if ($isCtrl) { my $tag_vlist = $alldat{$st}{TAG}; $at = mean($tag_vlist); $tag_good = $at >= $$p{THRESH} ? 1 : 0; } my @f = grep(!/TAG/, keys %{ $alldat{$st} }); foreach my $datf (@f) { foreach my $t (keys %{ $alldat{$st}{$datf} }) { my $vlist = $alldat{$st}{$datf}{$t}; my $m = mean($vlist); $msdat{$st}{$datf}{$t}{avg} = sprintf "%7.4f", $m; my $minint = $t eq $ZS && $$p{MININT} ? log2($$p{MININT}):$$p{MININT}; if ($isCtrl) { my $sd = stddev($vlist); my ($cv, $gt) = cv_goodtag($m, $sd, $minint, $$p{MAXCV}); $msdat{$st}{$datf}{$t}{stddev} = sprintf "%7.4f", $sd; $msdat{$st}{$datf}{$t}{CV} = $cv; $msdat{$st}{$datf}{$t}{GOOD_TAG} = $gt && $tag_good; $msdat{$st}{above_thresh} = $at; } } } } (\%msdat); } #----------------------------------------------------------------------------# sub compile_treat_data { my $p = shift; my $input = shift; my %tdat; foreach my $setname (sort grep(/^treat/, keys %$input)) { my $thash = $$input{$setname}; my $texps = $$p{T_EXPS}{$setname} = [sort grep(defined($$thash{EXPS}{$_}{DATA}), keys %{$$thash{EXPS}})]; my $tlab = $$thash{LABEL} ? "$setname: $$thash{LABEL}" : $setname; my $setdat = get_mean_of_exps($thash, $texps, $tlab, $p); if (%$setdat) { $tdat{$setname} = $setdat; } else { warn "No treat data for set $tlab.\n"; } } (\%tdat); } #----------------------------------------------------------------------------- sub print_data_values { my $input = shift; my $cdat = shift; my $tdat = shift; my $p = shift; my @t = @{ $$p{ANALYSES} }; my @strains = sort keys %$cdat; my @f = qw/avg stddev CV GOOD_TAG/; my @tf = qw/avg/; my %data; my @head; my $cexps = [ sort grep(defined($$input{$cLAB}{EXPS}{$_}{DATA}), keys %{ $$input{$cLAB}{EXPS} }) ]; my $clabel = $$input{$cLAB}{LABEL} || $cLAB; header_column_info(\@head, \%data, $cdat, \@f, $p, $$input{$cLAB}{EXPS}, $cexps, $clabel, $cLAB); if ($$p{T_EXPS} && %$tdat) { foreach my $tset (sort keys %$tdat) { my $texps = ref($$p{T_EXPS}{$tset}) eq 'ARRAY' ? $$p{T_EXPS}{$tset} : []; my $tlabel = $$input{$tset}{LABEL} || $tset; $tlabel = $tset if @$texps == 1; if (@$texps > 1) { header_column_info(\@head, \%data, $$tdat{$tset}, \@tf, $p, $$input{$tset}{EXPS}, $texps, $tlabel, $tset); } else { header_column_info(\@head, \%data, $$tdat{$tset}, [], $p, $$input{$tset}{EXPS}, $texps, $tlabel, $tset); } } } my $outfile = $$p{VSAVE} ? $$p{VSAVE} : 'fp_dbug.txt'; print STDERR "\nWriting data values to $outfile\n"; my $ofh = new FileHandle ">$outfile" or die ">$outfile: $!"; print_params($ofh, $input, $p); print $ofh "orf_name\tbatches\t".join("\t", @head)."\n"; foreach my $st (@strains) { my @row = split(/::/, $st, 2); foreach my $colname (@head) { my ($hash, @var) = @{ $data{$colname} }; my $v = $$hash{$st}; foreach my $va (@var) { last if !defined($v); $v = $$v{$va}; } push(@row, defined($v) ? $v : ''); } print $ofh join("\t", @row)."\n"; } $ofh->close; } sub header_column_info { my $head = shift; my $data = shift; my $dathash = shift; my $fields = shift; my $p = shift; my $inhash = shift; my $explist = shift; my $label = shift; my $lab2 = shift; my @t = @{ $$p{ANALYSES} }; foreach my $datf (@{$$p{FIELDS}}) { my $col = $DB_COLS{$datf}; foreach my $type (@t) { foreach my $f (@$fields) { my $colname = sprintf "%s:%s", $label, $$col{$f}{$type}; push(@$head, $colname); $$data{$colname} = [ $dathash, $datf, $type, $f ]; } } foreach my $exp (@$explist) { my $colname = sprintf "%s:%s:%s", $label eq $exp ? $lab2 : $label, shortname($datf), $exp; push(@$head, $colname); $$data{$colname} = [ $$inhash{$exp}{DATA}, $datf ]; } } } sub shortname { my $name = shift; $name =~ s/^(.).*_(.).*/$1$2/; ($name); } #----------------------------------------------------------------------------# sub get_strain_info { my $p = shift; if (my $keyfile = $$p{KEYFILE}) { warn "Getting strain data from $keyfile\n"; return parse_strain_info($keyfile); } else { return {}; } } sub parse_strain_info { my $keyfile = shift; my $fh = new FileHandle $keyfile or die "$keyfile: $!"; my @lines = $fh->getlines; foreach (@lines) {s/[\r\n]+$//} $fh->close; my %si; my @fds = split(/ *\t */, shift @lines); foreach (@lines) { my @d = split(/ *\t */, $_); my %h = map { $_ => shift @d } @fds; $h{strain} || next; $si{ $h{strain} } = \%h; } (\%si, \@fds); } #----------------------------------------------------------------------------# sub tag_analysis { my $cdat = shift; my $tdat = shift; my $p = shift; my %data; warn "\nAnalyzing tag data\n"; my $p90 = $$p{ZS} && $$p{FUDGE} ? prctile($cdat, 90, $p) : 0; foreach my $set (sort keys %$tdat) { my (%d, %tg); print STDERR " Analyzing $set:"; $d{$CC} = tag_ratios($cdat, $tdat, $set, \%tg, $p) if $$p{RATIO}; $d{$ZS} = tag_zscores($cdat, $tdat, $set, \%tg, $p90, $p) if $$p{ZS}; $d{TG} = \%tg; $data{$set} = \%d; print STDERR "\n"; } (\%data); } sub tag_ratios { my $ctrl = shift; my $tdat = shift; my $set = shift; my $tg = shift; my $p = shift; my %r; print STDERR " ratios ..."; my $expt = $$tdat{$set}; foreach my $st (sort keys %$ctrl) { my ($strain, $taggroup) = split_strain_tag($st); foreach my $datf (grep(!/above_thresh/, keys %{ $$ctrl{$st} })) { my $c = $$ctrl{$st}{$datf}{$CC}; my $e = $$expt{$st}{$datf}{$CC}; if (defined($$c{avg}) && defined($$e{avg})) { $$tg{$taggroup} = 1; if ($$c{GOOD_TAG}) { my $cv = $$c{avg} - $$p{BG}; my $ev = $$e{avg} - $$p{BG}; $cv = $ZERO if abs($cv) < $ZERO; $ev = $ZERO if abs($ev) < $ZERO; my $rn = log2($cv/$ev); $r{$datf}{$strain}{$taggroup} = $$p{CFX} ? $rn/exp($$p{CFX}*$cv) : $rn; } else { $r{$datf}{$strain}{$taggroup} = undef; } } } } (\%r); } ## actually -1 * z-score sub tag_zscores { my $ctrl = shift; my $tdat = shift; my $set = shift; my $tg = shift; my $p90 = shift; my $p = shift; my %z; print STDERR " Z-scores ..."; my $expt = $$tdat{$set}; foreach my $st (sort keys %$ctrl) { my ($strain, $taggroup) = split_strain_tag($st); foreach my $datf (grep(!/above_thresh/, keys %{ $$ctrl{$st} })) { my $fudge = $$p{FUDGE} && $$p90{$datf} ? $$p90{$datf} : 0; my $c = $$ctrl{$st}{$datf}{$ZS}; my $e = $$expt{$st}{$datf}{$ZS}; if (defined($$c{stddev}) && defined($$e{avg})) { $$tg{$taggroup} = 1; if ($$c{GOOD_TAG} && abs($$c{stddev})>$ZERO) { my $zn = $$c{avg} - $$e{avg}; my $zd = $$c{stddev}<$ZERO ? $ZERO : $$c{stddev}; $z{$datf}{$strain}{$taggroup} = $zn/($zd+$fudge); } else { $z{$datf}{$strain}{$taggroup} = undef; } } } } (\%z); } sub prctile { my $ctrl = shift; my $prc = shift; my %stddev; my $datf = $$p{FIELDS}; foreach my $st (keys %$ctrl) { foreach my $df (@$datf) { if (my $c = $$ctrl{$st}{$df}{$ZS}) { if (defined($$c{stddev})) { push(@{ $stddev{$df} }, $$c{stddev}); } } } } my %p; foreach my $df (keys %stddev) { my $N = my @sdsort = sort {$a<=>$b} @{ $stddev{$df} }; my $i = int($N*$prc/100); $p{$df} = $sdsort[$i]; printf STDERR " Fudge %-13s = %s, %d (%d)\n", $df, $sdsort[$i] || 'undef', $i, $N; } (\%p); } sub average_strain_tags { my $data = shift; my $input = shift; my $p = shift; my %res; my %strains; my @columns; warn "\nAveraging strain tags\n"; my $df = scalar(@{ $$p{C_EXPS} }) - 1; foreach my $set (sort keys %$data) { my @tags = reverse sort keys %{$$data{$set}{TG}}; my $tlabel = $$input{$set}{LABEL} || $set; foreach my $datf (@{ $$p{FIELDS} }) { foreach my $atype (@{ $$p{ANALYSES} }) { my $d = $$data{$set}{$atype}{$datf}; my $col = AR_COLS($atype, $datf, 'result', $tlabel); my @subargs = ($atype, $datf, $tlabel, \%res, \@columns, $p); average_tags($col, \%strains, $d, $df, @subargs); add_individual_tag_data(\@tags, $d, @subargs) if $$p{ALLTAGS}; add_sig_and_rank($col, @subargs); } } } (\%res, \%strains, \@columns); } sub AR_COLS { my $atype = shift; my $datf = shift; my $vtype = shift; my $set = shift; my $pre = $atype eq $ZS ? 'z_' : 'ratio_'; my $post = $datf eq 'norm_quantile' ? '_nq' : '_nm'; my $col = $pre.$vtype.$post; $col =~ s/::/-/g; $col .= "::".$set if $set; ($col); } sub average_tags { my $col = shift; my $strains = shift; my $data = shift; my $df = shift; my $atype = shift; my $datf = shift; my $tlabel = shift; my $res = shift; my $columns = shift; my $p = shift; push(@$columns, $col); my $pcol; if ($$p{PVAL} && $atype eq $ZS) { $pcol = AR_COLS($atype, $datf, "pval", $tlabel); push(@$columns, $pcol); } my @strains = keys %$data; foreach my $s (@strains) { $$strains{$s} = 1; # update strains my @v = grep(defined($_), values %{ $$data{$s} }); next unless @v; my $fit = $$res{$s}{$col} = mean(\@v); if ($pcol) { my $pv = Statistics::Distributions::tprob($df, $fit); if ($pv == 0) { $pv = 1e-20; } $$res{$s}{$pcol} = min([1, $pv]); } } } sub add_individual_tag_data { my $tags = shift; my $data = shift; my $atype = shift; my $datf = shift; my $tlabel = shift; my $res = shift; my $columns = shift; my $p = shift; foreach my $t (@$tags) { my $tcol = AR_COLS($atype, $datf, $t, $tlabel); push(@$columns, $tcol); foreach my $s (keys %$data) { my $v = $$data{$s}{$t}; $$res{$s}{$tcol} = $v if defined($v); } } } sub add_sig_and_rank { my $col = shift; my $atype = shift; my $datf = shift; my $tlabel = shift; my $res = shift; my $columns = shift; my $p = shift; my @strains = grep(defined($$res{$_}{$col}), keys %$res); my @v = map { $$res{$_}{$col} } @strains; return unless @v; my $m = sprintf "%7.4f ", mean(\@v); my $sd = sprintf "%7.4f ", stddev(\@v); my $sig = sprintf "%7.4f ", $m+3.5*$sd; my $rank = 0; my $sgc = AR_COLS($atype, $datf, 'sig', $tlabel); my $rkc = AR_COLS($atype, $datf, 'rank', $tlabel); push(@$columns, $sgc, $rkc); my $numsig = 0; if ($$p{IMGDIR}) { $$p{IMGC}{$col} = $sgc; } foreach my $s (reverse sort { $$res{$a}{$col}<=>$$res{$b}{$col}} @strains) { my $v = $$res{$s}{$col}; $$res{$s}{$sgc} = $v>$sig ? 1:0; $$res{$s}{$rkc} = ++$rank; $numsig++ if $v>$sig; } } #----------------------------------------------------------------------------# sub print_results { my $res = shift; my $strains = shift; my $columns = shift; my $input = shift; my $outfile = shift; my $p = shift; my $si = $$p{SI}; my $fds = $$p{FDS}; ## create header of column names my @outcols = @$columns; unless ($$p{ADD_SIG}) { @outcols = grep(!/_sig_/, @outcols) } unless ($$p{ADD_RANK}) { @outcols = grep(!/_rank_/, @outcols) } unless ($$p{RESULTTAGS}) { @outcols = grep(!/_result_/, @outcols) } my $head = "strain\tgene\t".join("\t", @outcols); my @desc = grep(!/barcode|^strain|^gene/, @$fds); $head .= "\t" . join("\t", @desc); ## print results to file warn "Writing results to $outfile\n"; my $ofh = new FileHandle ">$outfile" or die ">$outfile: $!"; print_params($ofh, $input, $p); print $ofh "$head\n"; foreach my $s (sort keys %$strains) { $$si{$s} = {} unless $$si{$s}; my @row = ($s, $$si{$s}{gene}||$s); foreach my $col (@outcols) { my $v = $$res{$s}{$col}; push(@row, defined($v) ? $v : ''); } foreach my $d (@desc) { push(@row, $$si{$s}{$d}||''); } print $ofh join("\t", @row) . "\n"; } } sub print_params { my $ofh = shift; my $input = shift; my $p = shift; my @params; foreach my $var (qw/minInt maxCV bg cfx/) { my $V = uc $var; next if ($V eq 'MAXCV' && $$p{$V} == 100) || $$p{$V} == 0; push(@params, sprintf "$var=%s", $$p{$V}); } print $ofh "# " . join("; ", @params)."\n" if @params; print $ofh "# Z-scores computed with fudge factor\n" if $$p{ZS} && $$p{FUDGE}; my $clabel = $$input{$cLAB}{LABEL} || $cLAB; print $ofh "#Control exps ($clabel): ".join(", ", @{ $$p{C_EXPS} })."\n"; foreach my $set (sort keys %{ $$p{T_EXPS} }) { my $texps = $$p{T_EXPS}{$set}; my $tlabel = $$input{$set}{LABEL} || $set; $tlabel = $set if @$texps == 1; print $ofh "#Treat exps ($tlabel): ". join(", ", @$texps)."\n"; } } ############################################################################### sub split_strain_tag { my $st = shift; my ($o, $b, @t) = split(/:+/, $st); my $strain = join("::", $o, $b); my $taggroup = join(":", @t); ($strain, $taggroup); } sub find_fields { my $fields = shift; my $find = shift; my $file = shift; my %hash; for(my $i=0; $i<@$fields; $i++) { foreach my $f (@$find) { $hash{$f} = $i if $$fields[$i] eq $f; } } foreach my $f (@$find) { unless (defined($hash{$f})) { warn "No field $f in $file\n"; } } (\%hash); } ##---------------------------------------------------------------------------- sub cv_goodtag { my $m = shift; my $sd = shift; my $minint = shift || 0; my $maxcv = shift || 100; return undef unless defined($m) && defined($sd); my $cv = $m ? sprintf "%6.4f", $sd/$m : 99; my $gt = $m>$minint && $cv<$maxcv ? 1:0; ($cv, $gt); } sub log2 { my $v = shift; my $l2 = log(2); return log($v)/$l2; } sub min { my $list = shift; return unless ref($list) eq 'ARRAY' && @$list; return ($$list[0]) if @$list == 1; my ($x) = sort {$a<=>$b} @$list; return $x; } sub max { my $list = shift; return unless ref($list) eq 'ARRAY' && @$list; return ($$list[0]) if @$list == 1; my ($x) = reverse sort {$a<=>$b} @$list; return $x; } sub median { my $list = shift; return unless ref($list) eq 'ARRAY' && @$list; return $$list[0] unless @$list>1; my @l = sort{$a<=>$b}@$list; return $l[$#l/2] if @l&1; my $mid= @l/2; return ($l[$mid-1]+$l[$mid])/2; } sub mean { my $list = shift; return unless ref($list) eq 'ARRAY' && @$list; return $$list[0] if @$list == 1; my $sum; foreach (@$list) { $sum += $_; } return $sum/scalar(@$list); } sub stddev { my $list = shift; return unless ref($list) eq 'ARRAY' && @$list; return 0 if @$list == 1; my $m = mean($list); my $nume = 0; foreach (@$list) { $nume += ($_-$m)**2; } return sqrt($nume/$#$list); } ############################################################################## sub create_image { my $res = shift; my $col = shift; my $sigcol = shift; my $type = shift; my $si = shift; my $pngf = shift; print STDERR " Creating image $pngf\n"; my ($w, $h, $xs, $ys) = get_dimensions($res, $col, 800); my $im = new GD::Image($w, $h); my $c = allocate_colors($im); my $font = $$p{FONT}; generate_axes($im, $font, $c, $xs, $ys, $type); my $xnum = 0; foreach my $st (sort {my $ka = $$si{$a}{gene}||$a; my $kb = $$si{$b}{gene}||$b; $ka cmp $kb } keys %$res) { my $y = yval($$res{$st}{$col}, $ys); my $x = xval(++$xnum, $xs); next unless defined($y); if ($$res{$st}{$sigcol}) { my $gene = $$si{$st}{gene} || $st; $im->ellipse($x-1, $y-1, 6, 6, $$c{red}); $im->stringFT($$c{blk}, $font, 8, 0, $x+5, $y+4, $gene); } else { $im->rectangle($x-2, $y-3, $x+2, $y+1, $$c{blu}); } } my $ofh = new FileHandle ">$pngf" or die ">$pngf: $!"; binmode($ofh); print $ofh $im->png; $ofh->close; } sub get_dimensions { my $dat = shift; my $col = shift; my $dim = shift; my $margin = 50; my @v = sort {$a<=>$b} grep(defined($_), map {$$dat{$_}{$col}} keys %$dat); my $w = $dim; my $h = $dim; my $plotw = $dim - 2*$margin; my (%xs, %ys); $xs{s} = $margin; $xs{e} = $dim-$margin; $xs{max} = scalar(keys %$dat); $xs{inc} = $plotw/$xs{max}; $ys{s} = $margin; $ys{e} = $dim-$margin; $ys{top} = $dim; $ys{max} = int($v[$#v])+1; $ys{min} = int($v[0]-1); $ys{inc} = $plotw/($ys{max}-$ys{min}); ($w, $h, \%xs, \%ys); } sub xval { my $x = shift; my $xs = shift; my $xval = $$xs{s} + int($x*$$xs{inc}); ($xval); } sub yval { my $y = shift; my $ys = shift; return unless defined($y); my $yval = $$ys{top} - ($$ys{s} + int(($y-$$ys{min})*$$ys{inc})); ($yval); } sub generate_axes { my $im = shift; my $font = shift; my $c = shift; my $xs = shift; my $ys = shift; my $type = shift; if ($type =~ /^z/) { $type = "Z-score" } my ($sx, $ex) = ($$xs{s}, $$xs{e}); my $sy = $$ys{top} - $$ys{s}; my $ey = $$ys{top} - $$ys{e}; my $y0 = yval(0, $ys); $im->line($sx, $sy, $ex, $sy, $$c{blk}); # x axis $im->line($sx, $y0, $ex, $y0, $$c{blk}); # line y=0 $im->line($sx, $sy, $sx, $ey, $$c{blk}); # y axis # x tick marks for(my $i=1000; $i<$$xs{max}-600; $i+=1000) { my $xcoord = xval($i, $xs); $im->line($xcoord, $sy-5, $xcoord, $sy+5, $$c{blk}); $im->stringFT($$c{blk}, $font, 9, 0, $xcoord-15, $sy+15, $i); } $im->line($ex, $sy-5, $ex, $sy+5, $$c{blk}); $im->stringFT($$c{blk}, $font, 9, 0, $ex-15, $sy+15, $$xs{max}); # y tick marks my $yinc = int(($$ys{max}-$$ys{min})/10) || 0.5; for(my $i=$$ys{min}; $i<=$$ys{max}; $i+=$yinc) { my $ycoord = yval($i, $ys); $im->line($sx-5, $ycoord, $sx+5, $ycoord, $$c{blk}); $im->stringFT($$c{blk}, $font, 9, 1.57, $sx-8, $ycoord+5, $i); } # x label $im->stringFT($$c{blk}, $font, 10, 0, $sx+330, $sy+30, "Gene"); # y label $im->stringFT($$c{blk}, $font, 10, 1.57, $sx-30, $sy-330, $type); } sub allocate_colors { my $im = shift; my %c; $c{bg} = $im->colorAllocate(255,255,255); $c{blk} = $im->colorAllocate(0,0,0); $c{gry} = $im->colorAllocate(80,80,80); $c{blu} = $im->colorAllocate(0,0,255); $c{red} = $im->colorAllocate(255,0,0); (\%c); } ############################################################################# =head1 USAGE EXAMPLES =over 4 =item B Calculate z-score and ratios using .norm files in the directory "ctrl" as the controls against .norm files in the directory "treat". Add gene and GO information to the output using information in the keyfile "files/straininfo.tab". =item B Calculate z-scores using both quantile normalized values and mean normalized values. Controls are in the directory "ctrl" and treat data are in the directory "treat. Omit any tag with an average control intensity less than 200 from the analysis. Save the control values to file "myctrlset.txt". =item B Same as above but use the values in the file "myctrlset.txt" as the control values instead of calculating from .norm files. Also, compare each treat experiment singly to the control data instead of averaging all treat data together. =back =cut