diff --git a/mzLib/FlashLFQ/FlashLFQResults.cs b/mzLib/FlashLFQ/FlashLFQResults.cs index ccc0448e6..a680a0f7c 100644 --- a/mzLib/FlashLFQ/FlashLFQResults.cs +++ b/mzLib/FlashLFQ/FlashLFQResults.cs @@ -1,11 +1,9 @@ -using Easy.Common.Extensions; -using MathNet.Numerics.Statistics; +using FlashLFQ.IsoTracker; using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Text; -using FlashLFQ.IsoTracker; namespace FlashLFQ { @@ -62,6 +60,136 @@ public FlashLfqResults(List spectraFiles, List } } + public void WriteResults(string peaksOutputPath, string modPeptideOutputPath, string proteinOutputPath, string bayesianProteinQuantOutput, bool silent) + { + if (!silent) + { + Console.WriteLine("Writing output..."); + } + + if (peaksOutputPath != null) + { + using (StreamWriter output = new StreamWriter(peaksOutputPath)) + { + output.WriteLine(ChromatographicPeak.TabSeparatedHeader); + + foreach (var peak in Peaks.SelectMany(p => p.Value) + .OrderBy(p => p.SpectraFileInfo.FilenameWithoutExtension) + .ThenByDescending(p => p.Intensity)) + { + output.WriteLine(peak.ToString()); + } + } + } + + if (modPeptideOutputPath != null) + { + using (StreamWriter output = new StreamWriter(modPeptideOutputPath)) + { + if (IsoTracker) + { + output.WriteLine(Peptide.TabSeparatedHeader_IsoTracker(SpectraFiles)); + // we want to output with same iso group index followed by peak order. + foreach (var peptide in PeptideModifiedSequences + .OrderBy(p => p.Value.IsoGroupIndex ?? int.MaxValue) + .ThenBy(p => p.Value.PeakOrder ?? int.MinValue)) + { + output.WriteLine(peptide.Value.ToString(SpectraFiles, IsoTracker)); + } + } + else + { + output.WriteLine(Peptide.TabSeparatedHeader(SpectraFiles)); + foreach (var peptide in PeptideModifiedSequences.OrderBy(p => p.Key)) + { + output.WriteLine(peptide.Value.ToString(SpectraFiles, IsoTracker)); + } + } + } + } + + if (proteinOutputPath != null) + { + using (StreamWriter output = new StreamWriter(proteinOutputPath)) + { + output.WriteLine(ProteinGroup.TabSeparatedHeader(SpectraFiles)); + + foreach (var protein in ProteinGroups.OrderBy(p => p.Key)) + { + output.WriteLine(protein.Value.ToString(SpectraFiles)); + } + } + } + + if (bayesianProteinQuantOutput != null) + { + StringBuilder header = new StringBuilder(); + StringBuilder[] proteinStringBuilders = new StringBuilder[ProteinGroups.Count]; + + for (int i = 0; i < proteinStringBuilders.Length; i++) + { + proteinStringBuilders[i] = new StringBuilder(); + } + + using (StreamWriter output = new StreamWriter(bayesianProteinQuantOutput)) + { + if (!ProteinGroups.Any()) + { + return; + } + + var firstProteinQuantResults = ProteinGroups.First().Value.ConditionToQuantificationResults; + + if (!firstProteinQuantResults.Any()) + { + return; + } + + string tabSepHeader = null; + + if (firstProteinQuantResults.First().Value is PairedProteinQuantResult) + { + tabSepHeader = PairedProteinQuantResult.TabSeparatedHeader(); + } + else + { + tabSepHeader = UnpairedProteinQuantResult.TabSeparatedHeader(); + } + + foreach (var condition in firstProteinQuantResults.Keys) + { + header.Append(tabSepHeader); + + int p = 0; + + // sort by protein false discovery rate, then by number of measurements + foreach (var protein in ProteinGroups + .OrderByDescending(v => v.Value.ConditionToQuantificationResults[condition].IsStatisticallyValid) + .ThenByDescending(v => v.Value.ConditionToQuantificationResults[condition].BayesFactor) + .ThenByDescending(v => v.Value.ConditionToQuantificationResults[condition].Peptides.Count)) + { + proteinStringBuilders[p].Append( + protein.Value.ConditionToQuantificationResults[condition].ToString()); + + p++; + } + } + + output.WriteLine(header); + + foreach (var proteinStringBuilder in proteinStringBuilders) + { + output.WriteLine(proteinStringBuilder); + } + } + } + + if (!silent) + { + Console.WriteLine("Finished writing output"); + } + } + public void ReNormalizeResults(bool integrate = false, int maxThreads = 10, bool useSharedPeptides = false) { foreach(var peak in Peaks.SelectMany(p => p.Value)) @@ -315,53 +443,14 @@ private void HandleAmbiguityInFractions() public void CalculateProteinResultsTop3(bool useSharedPeptides) { - foreach (var proteinGroup in ProteinGroups) - { - foreach (SpectraFileInfo file in SpectraFiles) - { - proteinGroup.Value.SetIntensity(file, 0); - } - } - - int topNPeaks = 3; - - List peptides = PeptideModifiedSequences.Values.Where(p => p.UnambiguousPeptideQuant()).ToList(); - Dictionary> proteinGroupToPeptides = new Dictionary>(); - - foreach (Peptide peptide in peptides) - { - if (!peptide.UseForProteinQuant) - { - continue; - } - - foreach (ProteinGroup pg in peptide.ProteinGroups) - { - if (proteinGroupToPeptides.TryGetValue(pg, out var peptidesForThisProtein)) - { - peptidesForThisProtein.Add(peptide); - } - else - { - proteinGroupToPeptides.Add(pg, new List { peptide }); - } - } - } - - foreach (ProteinGroup pg in ProteinGroups.Values) - { - if (proteinGroupToPeptides.TryGetValue(pg, out var peptidesForThisProtein)) - { - foreach (SpectraFileInfo file in SpectraFiles) - { - // top N peptides in the file - double proteinIntensity = peptidesForThisProtein.Where(p => p.ProteinGroups.Count == 1 || useSharedPeptides) - .Select(p => p.GetIntensity(file)).OrderByDescending(p => p).Take(topNPeaks).Sum(); - - pg.SetIntensity(file, proteinIntensity); - } - } - } + var strategy = new Top3ProteinQuantificationStrategy(); + var engine = new GenericProteinQuantificationEngine( + strategy, + PeptideModifiedSequences, + ProteinGroups, + SpectraFiles); + + engine.Run(useSharedPeptides); } /// @@ -370,423 +459,14 @@ public void CalculateProteinResultsTop3(bool useSharedPeptides) /// public void CalculateProteinResultsMedianPolish(bool useSharedPeptides) { - // reset protein intensities to 0 - foreach (var proteinGroup in ProteinGroups) - { - foreach (SpectraFileInfo file in SpectraFiles) - { - proteinGroup.Value.SetIntensity(file, 0); - } - } - - // associate peptide w/ proteins in a dictionary for easy lookup - List peptides = PeptideModifiedSequences.Values.Where(p => p.UnambiguousPeptideQuant()).ToList(); - Dictionary> proteinGroupToPeptides = new Dictionary>(); - - foreach (Peptide peptide in peptides) - { - if (!peptide.UseForProteinQuant || (peptide.ProteinGroups.Count > 1 && !useSharedPeptides)) - { - continue; - } - - foreach (ProteinGroup pg in peptide.ProteinGroups) - { - if (proteinGroupToPeptides.TryGetValue(pg, out var peptidesForThisProtein)) - { - peptidesForThisProtein.Add(peptide); - } - else - { - proteinGroupToPeptides.Add(pg, new List { peptide }); - } - } - } - - var filesGroupedByCondition = SpectraFiles.GroupBy(p => p.Condition).ToList(); - - // quantify each protein - foreach (ProteinGroup proteinGroup in ProteinGroups.Values) - { - if (proteinGroupToPeptides.TryGetValue(proteinGroup, out var peptidesForThisProtein)) - { - // set up peptide intensity table - // top row is the column effects, left column is the row effects - // the other cells are peptide intensity measurements - int numSamples = SpectraFiles.Select(p => p.Condition + p.BiologicalReplicate).Distinct().Count(); - double[][] peptideIntensityMatrix = new double[peptidesForThisProtein.Count + 1][]; - for (int i = 0; i < peptideIntensityMatrix.Length; i++) - { - peptideIntensityMatrix[i] = new double[numSamples + 1]; - } - - // populate matrix w/ log2-transformed peptide intensities - // if a value is missing, it will be filled with NaN - int sampleN = 0; - foreach (var group in SpectraFiles.GroupBy(p => p.Condition).OrderBy(p => p.Key)) - { - foreach (var sample in group.GroupBy(p => p.BiologicalReplicate).OrderBy(p => p.Key)) - { - foreach (Peptide peptide in peptidesForThisProtein) - { - double sampleIntensity = 0; - double highestFractionIntensity = 0; - - // the fraction w/ the highest intensity is used as the sample intensity for this peptide. - // if there is more than one replicate of the fraction, then the replicate intensities are averaged - foreach (var fraction in sample.GroupBy(p => p.Fraction)) - { - double fractionIntensity = 0; - int replicatesWithValidValues = 0; - - foreach (SpectraFileInfo replicate in fraction.OrderBy(p => p.TechnicalReplicate)) - { - double replicateIntensity = peptide.GetIntensity(replicate); - - if (replicateIntensity > 0) - { - fractionIntensity += replicateIntensity; - replicatesWithValidValues++; - } - } - - if (replicatesWithValidValues > 0) - { - fractionIntensity /= replicatesWithValidValues; - } - - if (fractionIntensity > highestFractionIntensity) - { - highestFractionIntensity = fractionIntensity; - sampleIntensity = highestFractionIntensity; - } - } - - int sampleNumber = sample.Key; - - if (sampleIntensity == 0) - { - sampleIntensity = double.NaN; - } - else - { - sampleIntensity = Math.Log(sampleIntensity, 2); - } - - peptideIntensityMatrix[peptidesForThisProtein.IndexOf(peptide) + 1][sampleN + 1] = sampleIntensity; - } - - sampleN++; - } - } - - // if there are any peptides that have only one measurement, mark them as NaN - // unless we have ONLY peptides with one measurement - var peptidesWithMoreThanOneMmt = peptideIntensityMatrix.Skip(1).Count(row => row.Skip(1).Count(cell => !double.IsNaN(cell)) > 1); - if (peptidesWithMoreThanOneMmt > 0) - { - for (int i = 1; i < peptideIntensityMatrix.Length; i++) - { - int validValueCount = peptideIntensityMatrix[i].Count(p => !double.IsNaN(p) && p != 0); - - if (validValueCount < 2 && numSamples >= 2) - { - for (int j = 1; j < peptideIntensityMatrix[0].Length; j++) - { - peptideIntensityMatrix[i][j] = double.NaN; - } - } - } - } - - // do median polish protein quantification - // row effects in a protein can be considered ~ relative ionization efficiency - // column effects are differences between conditions - MedianPolish(peptideIntensityMatrix); - - double overallEffect = peptideIntensityMatrix[0][0]; - double[] columnEffects = peptideIntensityMatrix[0].Skip(1).ToArray(); - double referenceProteinIntensity = Math.Pow(2, overallEffect) * peptidesForThisProtein.Count; - - // check for unquantifiable proteins; these are proteins w/ quantified peptides, but - // the protein is still not quantifiable because there are not peptides to compare across runs - List possibleUnquantifiableSample = new List(); - sampleN = 0; - foreach (var group in SpectraFiles.GroupBy(p => p.Condition).OrderBy(p => p.Key)) - { - foreach (var sample in group.GroupBy(p => p.BiologicalReplicate).OrderBy(p => p.Key)) - { - bool isMissingValue = true; - - foreach (SpectraFileInfo spectraFile in sample) - { - if (peptidesForThisProtein.Any(p => p.GetIntensity(spectraFile) != 0)) - { - isMissingValue = false; - break; - } - } - - if (!isMissingValue && columnEffects[sampleN] == 0) - { - possibleUnquantifiableSample.Add(group.Key + "_" + sample.Key); - } - - sampleN++; - } - } - - // set the sample protein intensities - sampleN = 0; - foreach (var group in SpectraFiles.GroupBy(p => p.Condition).OrderBy(p => p.Key)) - { - foreach (var sample in group.GroupBy(p => p.BiologicalReplicate).OrderBy(p => p.Key)) - { - // this step un-logs the protein "intensity". in reality this value is more like a fold-change - // than an intensity, but unlike a fold-change it's not relative to a particular sample. - // by multiplying this value by the reference protein intensity calculated earlier, then we get - // a protein intensity value - double columnEffect = columnEffects[sampleN]; - double sampleProteinIntensity = Math.Pow(2, columnEffect) * referenceProteinIntensity; - - // the column effect can be 0 in some cases. sometimes it's a valid value and sometimes it's not. - // so we need to check to see if it is actually a valid value - bool isMissingValue = true; - - foreach (SpectraFileInfo spectraFile in sample) - { - if (peptidesForThisProtein.Any(p => p.GetIntensity(spectraFile) != 0)) - { - isMissingValue = false; - break; - } - } - - if (!isMissingValue) - { - if (possibleUnquantifiableSample.Count > 1 && possibleUnquantifiableSample.Contains(group.Key + "_" + sample.Key)) - { - proteinGroup.SetIntensity(sample.First(), double.NaN); - } - else - { - proteinGroup.SetIntensity(sample.First(), sampleProteinIntensity); - } - } - - sampleN++; - } - } - } - } - } - - public void WriteResults(string peaksOutputPath, string modPeptideOutputPath, string proteinOutputPath, string bayesianProteinQuantOutput, bool silent) - { - if (!silent) - { - Console.WriteLine("Writing output..."); - } - - if (peaksOutputPath != null) - { - using (StreamWriter output = new StreamWriter(peaksOutputPath)) - { - output.WriteLine(ChromatographicPeak.TabSeparatedHeader); - - foreach (var peak in Peaks.SelectMany(p => p.Value) - .OrderBy(p => p.SpectraFileInfo.FilenameWithoutExtension) - .ThenByDescending(p => p.Intensity)) - { - output.WriteLine(peak.ToString()); - } - } - } - - if (modPeptideOutputPath != null) - { - using (StreamWriter output = new StreamWriter(modPeptideOutputPath)) - { - if (IsoTracker) - { - output.WriteLine(Peptide.TabSeparatedHeader_IsoTracker(SpectraFiles)); - // we want to output with same iso group index followed by peak order. - foreach (var peptide in PeptideModifiedSequences - .OrderBy(p => p.Value.IsoGroupIndex ?? int.MaxValue) - .ThenBy(p => p.Value.PeakOrder ?? int.MinValue)) - { - output.WriteLine(peptide.Value.ToString(SpectraFiles, IsoTracker)); - } - } - else - { - output.WriteLine(Peptide.TabSeparatedHeader(SpectraFiles)); - foreach (var peptide in PeptideModifiedSequences.OrderBy(p => p.Key)) - { - output.WriteLine(peptide.Value.ToString(SpectraFiles, IsoTracker)); - } - } - } - } - - if (proteinOutputPath != null) - { - using (StreamWriter output = new StreamWriter(proteinOutputPath)) - { - output.WriteLine(ProteinGroup.TabSeparatedHeader(SpectraFiles)); - - foreach (var protein in ProteinGroups.OrderBy(p => p.Key)) - { - output.WriteLine(protein.Value.ToString(SpectraFiles)); - } - } - } - - if (bayesianProteinQuantOutput != null) - { - StringBuilder header = new StringBuilder(); - StringBuilder[] proteinStringBuilders = new StringBuilder[ProteinGroups.Count]; - - for (int i = 0; i < proteinStringBuilders.Length; i++) - { - proteinStringBuilders[i] = new StringBuilder(); - } - - using (StreamWriter output = new StreamWriter(bayesianProteinQuantOutput)) - { - if (!ProteinGroups.Any()) - { - return; - } - - var firstProteinQuantResults = ProteinGroups.First().Value.ConditionToQuantificationResults; - - if (!firstProteinQuantResults.Any()) - { - return; - } - - string tabSepHeader = null; - - if (firstProteinQuantResults.First().Value is PairedProteinQuantResult) - { - tabSepHeader = PairedProteinQuantResult.TabSeparatedHeader(); - } - else - { - tabSepHeader = UnpairedProteinQuantResult.TabSeparatedHeader(); - } - - foreach (var condition in firstProteinQuantResults.Keys) - { - header.Append(tabSepHeader); - - int p = 0; - - // sort by protein false discovery rate, then by number of measurements - foreach (var protein in ProteinGroups - .OrderByDescending(v => v.Value.ConditionToQuantificationResults[condition].IsStatisticallyValid) - .ThenByDescending(v => v.Value.ConditionToQuantificationResults[condition].BayesFactor) - .ThenByDescending(v => v.Value.ConditionToQuantificationResults[condition].Peptides.Count)) - { - proteinStringBuilders[p].Append( - protein.Value.ConditionToQuantificationResults[condition].ToString()); - - p++; - } - } - - output.WriteLine(header); - - foreach (var proteinStringBuilder in proteinStringBuilders) - { - output.WriteLine(proteinStringBuilder); - } - } - } - - if (!silent) - { - Console.WriteLine("Finished writing output"); - } - } - - public static void MedianPolish(double[][] table, int maxIterations = 10, double improvementCutoff = 0.0001) - { - // technically, this is weighted mean polish and not median polish. - // but it should give similar results while being more robust to issues - // arising from missing values. - // the weights are inverse square difference to median. - - // subtract overall effect - List allValues = table.SelectMany(p => p.Where(p => !double.IsNaN(p) && p != 0)).ToList(); - - if (allValues.Any()) - { - double overallEffect = allValues.Median(); - table[0][0] += overallEffect; - - for (int r = 1; r < table.Length; r++) - { - for (int c = 1; c < table[0].Length; c++) - { - table[r][c] -= overallEffect; - } - } - } - - double sumAbsoluteResiduals = double.MaxValue; - - for (int i = 0; i < maxIterations; i++) - { - // subtract row effects - for (int r = 0; r < table.Length; r++) - { - List rowValues = table[r].Skip(1).Where(p => !double.IsNaN(p)).ToList(); - - if (rowValues.Any()) - { - double rowMedian = rowValues.Median(); - double[] weights = rowValues.Select(p => 1.0 / Math.Max(0.0001, Math.Pow(p - rowMedian, 2))).ToArray(); - double rowEffect = rowValues.Sum(p => p * weights[rowValues.IndexOf(p)]) / weights.Sum(); - table[r][0] += rowEffect; - - for (int c = 1; c < table[0].Length; c++) - { - table[r][c] -= rowEffect; - } - } - } - - // subtract column effects - for (int c = 0; c < table[0].Length; c++) - { - List colValues = table.Skip(1).Select(p => p[c]).Where(p => !double.IsNaN(p)).ToList(); - - if (colValues.Any()) - { - double colMedian = colValues.Median(); - double[] weights = colValues.Select(p => 1.0 / Math.Max(0.0001, Math.Pow(p - colMedian, 2))).ToArray(); - double colEffect = colValues.Sum(p => p * weights[colValues.IndexOf(p)]) / weights.Sum(); - table[0][c] += colEffect; - - for (int r = 1; r < table.Length; r++) - { - table[r][c] -= colEffect; - } - } - } - - // calculate sum of absolute residuals and end the algorithm if it is not improving - double iterationSumAbsoluteResiduals = table.Skip(1).SelectMany(p => p.Skip(1)).Where(p => !double.IsNaN(p)).Sum(p => Math.Abs(p)); - - if (Math.Abs((iterationSumAbsoluteResiduals - sumAbsoluteResiduals) / sumAbsoluteResiduals) < improvementCutoff) - { - break; - } - - sumAbsoluteResiduals = iterationSumAbsoluteResiduals; - } + var strategy = new MedianPolishProteinQuantificationStrategy(); + var engine = new GenericProteinQuantificationEngine( + strategy, + PeptideModifiedSequences, + ProteinGroups, + SpectraFiles); + + engine.Run(useSharedPeptides); } /// diff --git a/mzLib/FlashLFQ/IntensityNormalizationEngine.cs b/mzLib/FlashLFQ/IntensityNormalizationEngine.cs index 60834c66c..4b0845a88 100644 --- a/mzLib/FlashLFQ/IntensityNormalizationEngine.cs +++ b/mzLib/FlashLFQ/IntensityNormalizationEngine.cs @@ -9,20 +9,16 @@ namespace FlashLFQ { public class IntensityNormalizationEngine { - private const int numPeptidesDesiredFromEachFraction = 500; - private const int numPeptidesDesiredInMatrix = 5000; private readonly FlashLfqResults results; private readonly bool integrate; private readonly bool silent; private readonly bool quantifyAmbiguousPeptides; - private readonly int maxThreads; public IntensityNormalizationEngine(FlashLfqResults results, bool integrate, bool silent, int maxThreads, bool quantifyAmbiguousPeptides = false) { this.results = results; this.integrate = integrate; this.silent = silent; - this.maxThreads = maxThreads; } /// @@ -317,72 +313,6 @@ private void NormalizeBioreps() } } - /// - /// This method takes a list of peptides and creates a subset list of peptides to normalize with, to avoid - /// excessive computation time in normalization functions. - /// - //private List SubsetData(List initialList, List spectraFiles) - //{ - // List[] bothBioreps = new List[2]; - // var temp1 = spectraFiles.GroupBy(p => p.Condition).ToList(); - // if (temp1.Count() == 1) - // { - // // normalizing bioreps within a condition - // var temp2 = spectraFiles.GroupBy(p => p.BiologicalReplicate).ToList(); - // bothBioreps[0] = temp2[0].ToList(); - // bothBioreps[1] = temp2[1].ToList(); - // } - // else - // { - // // normalizing bioreps between conditions - // bothBioreps[0] = temp1[0].ToList(); - // bothBioreps[1] = temp1[1].ToList(); - // } - - // HashSet subsetList = new HashSet(); - // int maxFractionIndex = bothBioreps.SelectMany(p => p).Max(v => v.Fraction); - - // foreach (var biorep in bothBioreps) - // { - // List fractions = biorep.Select(p => p.Fraction).Distinct().ToList(); - - // int numToAddPerFraction = numPeptidesDesiredInMatrix / fractions.Count; - // if (numToAddPerFraction < numPeptidesDesiredFromEachFraction) - // { - // numToAddPerFraction = numPeptidesDesiredFromEachFraction; - // } - - // int[] peptidesAddedPerFraction = new int[fractions.Count]; - // Queue[] peptidesObservedInEachFraction = new Queue[fractions.Count]; - - // foreach (var file in biorep) - // { - // if (peptidesObservedInEachFraction[file.Fraction] == null) - // { - // peptidesObservedInEachFraction[file.Fraction] = new Queue(initialList.Where(p => p.GetIntensity(file) > 0) - // .OrderByDescending(p => p.GetIntensity(file))); - // } - // } - - // foreach (var fraction in fractions) - // { - // while (peptidesAddedPerFraction[fraction] < numToAddPerFraction && peptidesObservedInEachFraction[fraction].Any()) - // { - // var peptide = peptidesObservedInEachFraction[fraction].Dequeue(); - - // // don't need to check if the return list already contains the peptide because it's a HashSet (no duplicates are allowed) - // subsetList.Add(peptide); - - // // this peptide is in the return list regardless of whether or not it was actually just added; - // // we just want to guarantee this fraction has 500 peptides in the return list to normalize with - // peptidesAddedPerFraction[fraction]++; - // } - // } - // } - - // return subsetList.ToList(); - //} - /// /// Calculates normalization factors for fractionated data using a bounded Nelder-Mead optimization algorithm. /// Called by NormalizeFractions(). diff --git a/mzLib/FlashLFQ/Interfaces/IQuantifiablePeptide.cs b/mzLib/FlashLFQ/Interfaces/IQuantifiablePeptide.cs new file mode 100644 index 000000000..74d85bedb --- /dev/null +++ b/mzLib/FlashLFQ/Interfaces/IQuantifiablePeptide.cs @@ -0,0 +1,40 @@ +using System.Collections.Generic; + +namespace FlashLFQ +{ + /// + /// Defines the interface for peptides that can be quantified + /// + public interface IQuantifiablePeptide + { + /// + /// The peptide sequence + /// + string Sequence { get; } + + /// + /// Whether this peptide should be used for protein quantification + /// + bool UseForProteinQuant { get; } + + /// + /// The protein groups this peptide belongs to + /// + IEnumerable ProteinGroups { get; } + + /// + /// Gets the intensity for a specific file + /// + double GetIntensity(IQuantifiableSpectraFile fileInfo); + + /// + /// Gets the detection type for a specific file + /// + DetectionType GetDetectionType(IQuantifiableSpectraFile fileInfo); + + /// + /// Determines if this peptide has unambiguous quantification + /// + bool UnambiguousPeptideQuant(); + } +} diff --git a/mzLib/FlashLFQ/Interfaces/IQuantifiableProteinGroup.cs b/mzLib/FlashLFQ/Interfaces/IQuantifiableProteinGroup.cs new file mode 100644 index 000000000..00ec8610f --- /dev/null +++ b/mzLib/FlashLFQ/Interfaces/IQuantifiableProteinGroup.cs @@ -0,0 +1,25 @@ +using System.Collections.Generic; + +namespace FlashLFQ +{ + /// + /// Defines the interface for protein groups that can be quantified + /// + public interface IQuantifiableProteinGroup + { + /// + /// The protein group name/identifier + /// + string ProteinGroupName { get; } + + /// + /// Gets the intensity for a specific file + /// + double GetIntensity(IQuantifiableSpectraFile fileInfo); + + /// + /// Sets the intensity for a specific file + /// + void SetIntensity(IQuantifiableSpectraFile fileInfo, double intensity); + } +} diff --git a/mzLib/FlashLFQ/Interfaces/IQuantifiableSpectraFile.cs b/mzLib/FlashLFQ/Interfaces/IQuantifiableSpectraFile.cs new file mode 100644 index 000000000..ba3de41c5 --- /dev/null +++ b/mzLib/FlashLFQ/Interfaces/IQuantifiableSpectraFile.cs @@ -0,0 +1,33 @@ +namespace FlashLFQ +{ + /// + /// Defines the interface for spectra file information + /// + public interface IQuantifiableSpectraFile + { + /// + /// The filename without extension + /// + string FilenameWithoutExtension { get; } + + /// + /// The experimental condition + /// + string Condition { get; } + + /// + /// The biological replicate number + /// + int BiologicalReplicate { get; } + + /// + /// The technical replicate number + /// + int TechnicalReplicate { get; } + + /// + /// The fraction number + /// + int Fraction { get; } + } +} diff --git a/mzLib/FlashLFQ/Peptide.cs b/mzLib/FlashLFQ/Peptide.cs index 30a0c2550..36094e544 100644 --- a/mzLib/FlashLFQ/Peptide.cs +++ b/mzLib/FlashLFQ/Peptide.cs @@ -4,7 +4,7 @@ namespace FlashLFQ { - public class Peptide + public class Peptide : IQuantifiablePeptide { public readonly string Sequence; public readonly string BaseSequence; @@ -289,5 +289,30 @@ public bool UnambiguousPeptideQuant() { return Intensities.Values.Any(p => p > 0) && DetectionTypes.Values.Any(p => p != DetectionType.MSMSAmbiguousPeakfinding); } + + // Explicit interface implementations for IQuantifiablePeptide + string IQuantifiablePeptide.Sequence => Sequence; + bool IQuantifiablePeptide.UseForProteinQuant => UseForProteinQuant; + IEnumerable IQuantifiablePeptide.ProteinGroups => ProteinGroups.Cast(); + + double IQuantifiablePeptide.GetIntensity(IQuantifiableSpectraFile fileInfo) + { + if (fileInfo is SpectraFileInfo spectraFileInfo) + { + return GetIntensity(spectraFileInfo); + } + throw new System.ArgumentException("File info must be of type SpectraFileInfo"); + } + + DetectionType IQuantifiablePeptide.GetDetectionType(IQuantifiableSpectraFile fileInfo) + { + if (fileInfo is SpectraFileInfo spectraFileInfo) + { + return GetDetectionType(spectraFileInfo); + } + throw new System.ArgumentException("File info must be of type SpectraFileInfo"); + } + + bool IQuantifiablePeptide.UnambiguousPeptideQuant() => UnambiguousPeptideQuant(); } } \ No newline at end of file diff --git a/mzLib/FlashLFQ/ProteinGroup.cs b/mzLib/FlashLFQ/ProteinGroup.cs index de87c4539..823d7d3bd 100644 --- a/mzLib/FlashLFQ/ProteinGroup.cs +++ b/mzLib/FlashLFQ/ProteinGroup.cs @@ -4,7 +4,7 @@ namespace FlashLFQ { - public class ProteinGroup + public class ProteinGroup : IQuantifiableProteinGroup { public readonly string ProteinGroupName; public readonly string GeneName; @@ -111,5 +111,29 @@ public override int GetHashCode() { return ProteinGroupName.GetHashCode(); } + + // Explicit interface implementation for IQuantifiableProteinGroup + string IQuantifiableProteinGroup.ProteinGroupName => ProteinGroupName; + + double IQuantifiableProteinGroup.GetIntensity(IQuantifiableSpectraFile fileInfo) + { + if (fileInfo is SpectraFileInfo spectraFileInfo) + { + return GetIntensity(spectraFileInfo); + } + throw new System.ArgumentException("File info must be of type SpectraFileInfo"); + } + + void IQuantifiableProteinGroup.SetIntensity(IQuantifiableSpectraFile fileInfo, double intensity) + { + if (fileInfo is SpectraFileInfo spectraFileInfo) + { + SetIntensity(spectraFileInfo, intensity); + } + else + { + throw new System.ArgumentException("File info must be of type SpectraFileInfo"); + } + } } } \ No newline at end of file diff --git a/mzLib/FlashLFQ/ProteinQuantification/GenericProteinQuantificationEngine.cs b/mzLib/FlashLFQ/ProteinQuantification/GenericProteinQuantificationEngine.cs new file mode 100644 index 000000000..0ad1f8336 --- /dev/null +++ b/mzLib/FlashLFQ/ProteinQuantification/GenericProteinQuantificationEngine.cs @@ -0,0 +1,107 @@ +using Easy.Common.Extensions; +using MathNet.Numerics.Statistics; +using System; +using System.Collections.Generic; +using System.Linq; + +namespace FlashLFQ +{ + /// + /// Defines the strategy interface for protein quantification algorithms + /// + public interface IProteinQuantificationStrategy + where TPeptide : IQuantifiablePeptide + where TProteinGroup : IQuantifiableProteinGroup + where TSpectraFile : IQuantifiableSpectraFile + { + /// + /// Quantifies proteins based on their constituent peptides + /// + void QuantifyProteins( + Dictionary> proteinGroupToPeptides, + List spectraFiles, + bool useSharedPeptides); + } + + /// + /// Generic protein quantification engine that uses a strategy pattern + /// to support different quantification algorithms (Top3, MedianPolish, etc.) + /// + public class GenericProteinQuantificationEngine + where TPeptide : IQuantifiablePeptide + where TProteinGroup : IQuantifiableProteinGroup + where TSpectraFile : IQuantifiableSpectraFile + { + private readonly IProteinQuantificationStrategy _strategy; + private readonly Dictionary _peptideModifiedSequences; + private readonly Dictionary _proteinGroups; + private readonly List _spectraFiles; + + public GenericProteinQuantificationEngine( + IProteinQuantificationStrategy strategy, + Dictionary peptideModifiedSequences, + Dictionary proteinGroups, + List spectraFiles) + { + _strategy = strategy ?? throw new ArgumentNullException(nameof(strategy)); + _peptideModifiedSequences = peptideModifiedSequences ?? throw new ArgumentNullException(nameof(peptideModifiedSequences)); + _proteinGroups = proteinGroups ?? throw new ArgumentNullException(nameof(proteinGroups)); + _spectraFiles = spectraFiles ?? throw new ArgumentNullException(nameof(spectraFiles)); + } + + /// + /// Runs the protein quantification using the configured strategy + /// + public void Run(bool useSharedPeptides = false) + { + // Reset protein intensities to 0 + foreach (var proteinGroup in _proteinGroups) + { + foreach (TSpectraFile file in _spectraFiles) + { + proteinGroup.Value.SetIntensity(file, 0); + } + } + + // Associate peptides with proteins + Dictionary> proteinGroupToPeptides = + new Dictionary>(); + + List peptides = _peptideModifiedSequences.Values + .Where(p => p.UnambiguousPeptideQuant()) + .ToList(); + + foreach (TPeptide peptide in peptides) + { + if (!peptide.UseForProteinQuant) + { + continue; + } + + // Handle shared peptides based on useSharedPeptides parameter + bool isSharedPeptide = peptide.ProteinGroups.Count() > 1; + if (!useSharedPeptides && isSharedPeptide) + { + continue; + } + + foreach (var pg in peptide.ProteinGroups) + { + TProteinGroup proteinGroup = (TProteinGroup)pg; + + if (proteinGroupToPeptides.TryGetValue(proteinGroup, out var peptidesForThisProtein)) + { + peptidesForThisProtein.Add(peptide); + } + else + { + proteinGroupToPeptides.Add(proteinGroup, new List { peptide }); + } + } + } + + // Execute the quantification strategy + _strategy.QuantifyProteins(proteinGroupToPeptides, _spectraFiles, useSharedPeptides); + } + } +} diff --git a/mzLib/FlashLFQ/ProteinQuantification/MedianPolishProteinQuantificationStrategy.cs b/mzLib/FlashLFQ/ProteinQuantification/MedianPolishProteinQuantificationStrategy.cs new file mode 100644 index 000000000..47820f03d --- /dev/null +++ b/mzLib/FlashLFQ/ProteinQuantification/MedianPolishProteinQuantificationStrategy.cs @@ -0,0 +1,330 @@ +using Easy.Common.Extensions; +using MathNet.Numerics.Statistics; +using System; +using System.Collections.Generic; +using System.Linq; + +namespace FlashLFQ +{ + /// + /// Median polish protein quantification strategy: uses the median polish algorithm + /// to calculate protein quantities in each biological replicate + /// See https://mgimond.github.io/ES218/Week11a.html for an example of the median polish algorithm. + /// + public class MedianPolishProteinQuantificationStrategy + : IProteinQuantificationStrategy + where TPeptide : IQuantifiablePeptide + where TProteinGroup : IQuantifiableProteinGroup + where TSpectraFile : IQuantifiableSpectraFile + { + private readonly int _maxIterations; + private readonly double _improvementCutoff; + + public MedianPolishProteinQuantificationStrategy( + int maxIterations = 10, + double improvementCutoff = 0.0001) + { + _maxIterations = maxIterations; + _improvementCutoff = improvementCutoff; + } + + public void QuantifyProteins( + Dictionary> proteinGroupToPeptides, + List spectraFiles, + bool useSharedPeptides) + { + // Quantify each protein + foreach (var kvp in proteinGroupToPeptides) + { + TProteinGroup proteinGroup = kvp.Key; + List peptidesForThisProtein = kvp.Value; + + // Set up peptide intensity table + int numSamples = spectraFiles + .Select(p => p.Condition + p.BiologicalReplicate) + .Distinct() + .Count(); + + double[][] peptideIntensityMatrix = new double[peptidesForThisProtein.Count + 1][]; + for (int i = 0; i < peptideIntensityMatrix.Length; i++) + { + peptideIntensityMatrix[i] = new double[numSamples + 1]; + } + + // Populate matrix with log2-transformed peptide intensities + int sampleN = 0; + foreach (var group in spectraFiles.GroupBy(p => p.Condition).OrderBy(p => p.Key)) + { + foreach (var sample in group.GroupBy(p => p.BiologicalReplicate).OrderBy(p => p.Key)) + { + foreach (TPeptide peptide in peptidesForThisProtein) + { + double sampleIntensity = CalculateSampleIntensity(peptide, sample.ToList()); + + if (sampleIntensity == 0) + { + sampleIntensity = double.NaN; + } + else + { + sampleIntensity = Math.Log(sampleIntensity, 2); + } + + peptideIntensityMatrix[peptidesForThisProtein.IndexOf(peptide) + 1][sampleN + 1] = sampleIntensity; + } + + sampleN++; + } + } + + // Filter out peptides with only one measurement (unless all peptides have one measurement) + FilterSingleMeasurementPeptides(peptideIntensityMatrix, numSamples); + + // Do median polish protein quantification + MedianPolish(peptideIntensityMatrix, _maxIterations, _improvementCutoff); + + // Extract results and set protein intensities + double overallEffect = peptideIntensityMatrix[0][0]; + double[] columnEffects = peptideIntensityMatrix[0].Skip(1).ToArray(); + double referenceProteinIntensity = Math.Pow(2, overallEffect) * peptidesForThisProtein.Count; + + // Check for unquantifiable proteins + List possibleUnquantifiableSample = IdentifyUnquantifiableProteins( + spectraFiles, peptidesForThisProtein, columnEffects); + + // Set the sample protein intensities + SetProteinIntensities( + proteinGroup, + spectraFiles, + peptidesForThisProtein, + columnEffects, + referenceProteinIntensity, + possibleUnquantifiableSample); + } + } + + private double CalculateSampleIntensity(TPeptide peptide, List sampleFiles) + { + double sampleIntensity = 0; + double highestFractionIntensity = 0; + + // The fraction with the highest intensity is used as the sample intensity + foreach (var fraction in sampleFiles.GroupBy(p => p.Fraction)) + { + double fractionIntensity = 0; + int replicatesWithValidValues = 0; + + foreach (TSpectraFile replicate in fraction.OrderBy(p => p.TechnicalReplicate)) + { + double replicateIntensity = peptide.GetIntensity(replicate); + + if (replicateIntensity > 0) + { + fractionIntensity += replicateIntensity; + replicatesWithValidValues++; + } + } + + if (replicatesWithValidValues > 0) + { + fractionIntensity /= replicatesWithValidValues; + } + + if (fractionIntensity > highestFractionIntensity) + { + highestFractionIntensity = fractionIntensity; + sampleIntensity = highestFractionIntensity; + } + } + + return sampleIntensity; + } + + private void FilterSingleMeasurementPeptides(double[][] peptideIntensityMatrix, int numSamples) + { + var peptidesWithMoreThanOneMmt = peptideIntensityMatrix + .Skip(1) + .Count(row => row.Skip(1).Count(cell => !double.IsNaN(cell)) > 1); + + if (peptidesWithMoreThanOneMmt > 0) + { + for (int i = 1; i < peptideIntensityMatrix.Length; i++) + { + int validValueCount = peptideIntensityMatrix[i] + .Count(p => !double.IsNaN(p) && p != 0); + + if (validValueCount < 2 && numSamples >= 2) + { + for (int j = 1; j < peptideIntensityMatrix[0].Length; j++) + { + peptideIntensityMatrix[i][j] = double.NaN; + } + } + } + } + } + + private List IdentifyUnquantifiableProteins( + List spectraFiles, + List peptidesForThisProtein, + double[] columnEffects) + { + List possibleUnquantifiableSample = new List(); + int sampleN = 0; + + foreach (var group in spectraFiles.GroupBy(p => p.Condition).OrderBy(p => p.Key)) + { + foreach (var sample in group.GroupBy(p => p.BiologicalReplicate).OrderBy(p => p.Key)) + { + bool isMissingValue = true; + + foreach (TSpectraFile spectraFile in sample) + { + if (peptidesForThisProtein.Any(p => p.GetIntensity(spectraFile) != 0)) + { + isMissingValue = false; + break; + } + } + + if (!isMissingValue && columnEffects[sampleN] == 0) + { + possibleUnquantifiableSample.Add(group.Key + "_" + sample.Key); + } + + sampleN++; + } + } + + return possibleUnquantifiableSample; + } + + private void SetProteinIntensities( + TProteinGroup proteinGroup, + List spectraFiles, + List peptidesForThisProtein, + double[] columnEffects, + double referenceProteinIntensity, + List possibleUnquantifiableSample) + { + int sampleN = 0; + + foreach (var group in spectraFiles.GroupBy(p => p.Condition).OrderBy(p => p.Key)) + { + foreach (var sample in group.GroupBy(p => p.BiologicalReplicate).OrderBy(p => p.Key)) + { + double columnEffect = columnEffects[sampleN]; + double sampleProteinIntensity = Math.Pow(2, columnEffect) * referenceProteinIntensity; + + // Check if this is a valid value + bool isMissingValue = true; + + foreach (TSpectraFile spectraFile in sample) + { + if (peptidesForThisProtein.Any(p => p.GetIntensity(spectraFile) != 0)) + { + isMissingValue = false; + break; + } + } + + if (!isMissingValue) + { + if (possibleUnquantifiableSample.Count > 1 && + possibleUnquantifiableSample.Contains(group.Key + "_" + sample.Key)) + { + proteinGroup.SetIntensity(sample.First(), double.NaN); + } + else + { + proteinGroup.SetIntensity(sample.First(), sampleProteinIntensity); + } + } + + sampleN++; + } + } + } + + /// + /// Performs the median polish algorithm on a data table + /// Technically, this is weighted mean polish and not median polish, but it gives similar results + /// while being more robust to issues arising from missing values. + /// + public static void MedianPolish(double[][] table, int maxIterations = 10, double improvementCutoff = 0.0001) + { + // Subtract overall effect + List allValues = table.SelectMany(p => p.Where(p => !double.IsNaN(p) && p != 0)).ToList(); + + if (allValues.Any()) + { + double overallEffect = allValues.Median(); + table[0][0] += overallEffect; + + for (int r = 1; r < table.Length; r++) + { + for (int c = 1; c < table[0].Length; c++) + { + table[r][c] -= overallEffect; + } + } + } + + double sumAbsoluteResiduals = double.MaxValue; + + for (int i = 0; i < maxIterations; i++) + { + // Subtract row effects + for (int r = 0; r < table.Length; r++) + { + List rowValues = table[r].Skip(1).Where(p => !double.IsNaN(p)).ToList(); + + if (rowValues.Any()) + { + double rowMedian = rowValues.Median(); + double[] weights = rowValues.Select(p => 1.0 / Math.Max(0.0001, Math.Pow(p - rowMedian, 2))).ToArray(); + double rowEffect = rowValues.Sum(p => p * weights[rowValues.IndexOf(p)]) / weights.Sum(); + table[r][0] += rowEffect; + + for (int c = 1; c < table[0].Length; c++) + { + table[r][c] -= rowEffect; + } + } + } + + // Subtract column effects + for (int c = 0; c < table[0].Length; c++) + { + List colValues = table.Skip(1).Select(p => p[c]).Where(p => !double.IsNaN(p)).ToList(); + + if (colValues.Any()) + { + double colMedian = colValues.Median(); + double[] weights = colValues.Select(p => 1.0 / Math.Max(0.0001, Math.Pow(p - colMedian, 2))).ToArray(); + double colEffect = colValues.Sum(p => p * weights[colValues.IndexOf(p)]) / weights.Sum(); + table[0][c] += colEffect; + + for (int r = 1; r < table.Length; r++) + { + table[r][c] -= colEffect; + } + } + } + + // Calculate sum of absolute residuals and end if not improving + double iterationSumAbsoluteResiduals = table.Skip(1) + .SelectMany(p => p.Skip(1)) + .Where(p => !double.IsNaN(p)) + .Sum(p => Math.Abs(p)); + + if (Math.Abs((iterationSumAbsoluteResiduals - sumAbsoluteResiduals) / sumAbsoluteResiduals) < improvementCutoff) + { + break; + } + + sumAbsoluteResiduals = iterationSumAbsoluteResiduals; + } + } + } +} diff --git a/mzLib/FlashLFQ/ProteinQuantification/Top3ProteinQuantificationStrategy.cs b/mzLib/FlashLFQ/ProteinQuantification/Top3ProteinQuantificationStrategy.cs new file mode 100644 index 000000000..da7ba5e61 --- /dev/null +++ b/mzLib/FlashLFQ/ProteinQuantification/Top3ProteinQuantificationStrategy.cs @@ -0,0 +1,48 @@ +using System.Collections.Generic; +using System.Linq; + +namespace FlashLFQ +{ + /// + /// Top3 protein quantification strategy: protein intensity is calculated as the sum of + /// the top 3 peptide intensities in each file + /// + public class Top3ProteinQuantificationStrategy + : IProteinQuantificationStrategy + where TPeptide : IQuantifiablePeptide + where TProteinGroup : IQuantifiableProteinGroup + where TSpectraFile : IQuantifiableSpectraFile + { + private readonly int _topN; + + public Top3ProteinQuantificationStrategy(int topN = 3) + { + _topN = topN; + } + + public void QuantifyProteins( + Dictionary> proteinGroupToPeptides, + List spectraFiles, + bool useSharedPeptides) + { + foreach (var kvp in proteinGroupToPeptides) + { + TProteinGroup proteinGroup = kvp.Key; + List peptidesForThisProtein = kvp.Value; + + foreach (TSpectraFile file in spectraFiles) + { + // Get top N peptides by intensity in this file + double proteinIntensity = peptidesForThisProtein + .Where(p => p.ProteinGroups.Count() == 1 || useSharedPeptides) + .Select(p => p.GetIntensity(file)) + .OrderByDescending(intensity => intensity) + .Take(_topN) + .Sum(); + + proteinGroup.SetIntensity(file, proteinIntensity); + } + } + } + } +} diff --git a/mzLib/FlashLFQ/SpectraFileInfo.cs b/mzLib/FlashLFQ/SpectraFileInfo.cs index 9cc24b6d7..c74541cd0 100644 --- a/mzLib/FlashLFQ/SpectraFileInfo.cs +++ b/mzLib/FlashLFQ/SpectraFileInfo.cs @@ -2,7 +2,7 @@ namespace FlashLFQ { - public class SpectraFileInfo + public class SpectraFileInfo : IQuantifiableSpectraFile { /// /// The path to the data file (e.g., a .raw file) with the extension diff --git a/mzLib/Test/FlashLFQ/TestFlashLFQ.cs b/mzLib/Test/FlashLFQ/TestFlashLFQ.cs index ccc614be9..8b2c9b1d8 100644 --- a/mzLib/Test/FlashLFQ/TestFlashLFQ.cs +++ b/mzLib/Test/FlashLFQ/TestFlashLFQ.cs @@ -1407,7 +1407,7 @@ public static IEnumerable MedianPolishTestCases() [Test, TestCaseSource("MedianPolishTestCases")] public static void TestMedianPolishWithIntensity(double[][] intensityArray, double[] expectedRowEffects, double[] expectedColumnEffects, double expectedOverallEffect) { - FlashLfqResults.MedianPolish(intensityArray); + MedianPolishProteinQuantificationStrategy.MedianPolish(intensityArray); var rowEffects = intensityArray.Select(p => p[0]).Skip(1).ToArray(); var columnEffects = intensityArray[0].Skip(1).ToArray(); var overallEffect = intensityArray[0][0]; @@ -1428,7 +1428,7 @@ public static void TestMedianPolish() new double[] { 0, 7, 8 } }; - FlashLfqResults.MedianPolish(array2D); + MedianPolishProteinQuantificationStrategy.MedianPolish(array2D); var rowEffects = array2D.Select(p => p[0]).Skip(1).ToArray(); var columnEffects = array2D[0].Skip(1).ToArray(); var overallEffect = array2D[0][0];