From 5276f422000f21698189447d5df319ff2c8153fe Mon Sep 17 00:00:00 2001 From: Mark Tomko Date: Thu, 8 Feb 2024 10:25:13 -0800 Subject: [PATCH 01/12] Rename parameter --- .../org/broadinstitute/gpp/poolq3/reports/QualityWriter.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/org/broadinstitute/gpp/poolq3/reports/QualityWriter.scala b/src/main/scala/org/broadinstitute/gpp/poolq3/reports/QualityWriter.scala index 987eac7..3c5c654 100644 --- a/src/main/scala/org/broadinstitute/gpp/poolq3/reports/QualityWriter.scala +++ b/src/main/scala/org/broadinstitute/gpp/poolq3/reports/QualityWriter.scala @@ -17,13 +17,13 @@ import org.broadinstitute.gpp.poolq3.reference.Reference object QualityWriter { def write( - file: Path, + qualityFile: Path, state: State, rowReference: Reference, colReference: Reference, isPairedEnd: Boolean ): Try[Unit] = - Using(new PrintWriter(file.toFile)) { writer => + Using(new PrintWriter(qualityFile.toFile)) { writer => val barcodeLocationStats = if (isPairedEnd) { s"""Reads with no construct barcode: ${state.rowBarcodeNotFound + state.revRowBarcodeNotFound - state.neitherRowBarcodeFound} From dfb12ee6c5d5fd00ea6e383f07bc019b11362400 Mon Sep 17 00:00:00 2001 From: Mark Tomko Date: Thu, 8 Feb 2024 14:59:19 -0800 Subject: [PATCH 02/12] Add output file type --- .../org/broadinstitute/gpp/poolq3/types/OutputFileType.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/scala/org/broadinstitute/gpp/poolq3/types/OutputFileType.scala b/src/main/scala/org/broadinstitute/gpp/poolq3/types/OutputFileType.scala index e4dfaae..3dfccec 100644 --- a/src/main/scala/org/broadinstitute/gpp/poolq3/types/OutputFileType.scala +++ b/src/main/scala/org/broadinstitute/gpp/poolq3/types/OutputFileType.scala @@ -8,6 +8,7 @@ package org.broadinstitute.gpp.poolq3.types trait OutputFileType extends Product with Serializable case object CountsFileType extends OutputFileType case object QualityFileType extends OutputFileType +case object ConditionBarcodeCountsSummaryFileType extends OutputFileType case object LogNormalizedCountsFileType extends OutputFileType case object BarcodeCountsFileType extends OutputFileType case object CorrelationFileType extends OutputFileType From b4b3045e55e68c33610328d09a52b14950b0975e Mon Sep 17 00:00:00 2001 From: Mark Tomko Date: Thu, 8 Feb 2024 14:59:31 -0800 Subject: [PATCH 03/12] case class hygiene --- .../org/broadinstitute/gpp/poolq3/types/PoolQSummary.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/org/broadinstitute/gpp/poolq3/types/PoolQSummary.scala b/src/main/scala/org/broadinstitute/gpp/poolq3/types/PoolQSummary.scala index 03832a7..33fad18 100644 --- a/src/main/scala/org/broadinstitute/gpp/poolq3/types/PoolQSummary.scala +++ b/src/main/scala/org/broadinstitute/gpp/poolq3/types/PoolQSummary.scala @@ -5,4 +5,4 @@ */ package org.broadinstitute.gpp.poolq3.types -case class PoolQSummary(runSummary: PoolQRunSummary, outputFiles: Set[OutputFileType]) +final case class PoolQSummary(runSummary: PoolQRunSummary, outputFiles: Set[OutputFileType]) From b90369ba69b32a4935320ff029b8cfa868677942 Mon Sep 17 00:00:00 2001 From: Mark Tomko Date: Thu, 8 Feb 2024 15:00:01 -0800 Subject: [PATCH 04/12] Add output file to config --- src/main/scala/org/broadinstitute/gpp/poolq3/PoolQConfig.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQConfig.scala b/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQConfig.scala index 185f075..e3491ac 100644 --- a/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQConfig.scala +++ b/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQConfig.scala @@ -74,6 +74,7 @@ final case class PoolQOutput( normalizedCountsFile: Path = Paths.get("lognormalized-counts.txt"), barcodeCountsFile: Path = Paths.get("barcode-counts.txt"), qualityFile: Path = Paths.get("quality.txt"), + conditionBarcodeCountsSummaryFile: Path = Paths.get("condition-barcode-counts-summary.txt"), correlationFile: Path = Paths.get("correlation.txt"), unexpectedSequencesFile: Path = Paths.get("unexpected-sequences.txt"), umiQualityFile: Path = Paths.get("umi-quality.txt"), From 56b2df09ba2ee5e1c89824fc1200ae93b345293e Mon Sep 17 00:00:00 2001 From: Mark Tomko Date: Thu, 8 Feb 2024 15:00:24 -0800 Subject: [PATCH 05/12] Write summary file --- .../org/broadinstitute/gpp/poolq3/PoolQ.scala | 19 ++++- .../gpp/poolq3/reports/QualityWriter.scala | 76 +++++++++++++------ 2 files changed, 68 insertions(+), 27 deletions(-) diff --git a/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQ.scala b/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQ.scala index c5ae8b8..b4308a0 100644 --- a/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQ.scala +++ b/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQ.scala @@ -33,6 +33,7 @@ import org.broadinstitute.gpp.poolq3.reports.{ } import org.broadinstitute.gpp.poolq3.types.{ BarcodeCountsFileType, + ConditionBarcodeCountsSummaryFileType, CountsFileType, LogNormalizedCountsFileType, OutputFileType, @@ -49,7 +50,14 @@ object PoolQ { private[this] val log: Logger = getLogger private[this] val AlwaysWrittenFiles: Set[OutputFileType] = - Set(CountsFileType, QualityFileType, LogNormalizedCountsFileType, BarcodeCountsFileType, RunInfoFileType) + Set( + CountsFileType, + QualityFileType, + ConditionBarcodeCountsSummaryFileType, + LogNormalizedCountsFileType, + BarcodeCountsFileType, + RunInfoFileType + ) final def main(args: Array[String]): Unit = PoolQConfig.parse(args) match { @@ -169,7 +177,14 @@ object PoolQ { config.reportsDialect ) _ = log.info(s"Writing quality file ${config.output.qualityFile}") - _ <- QualityWriter.write(config.output.qualityFile, state, rowReference, colReference, config.isPairedEnd) + _ <- QualityWriter.write( + config.output.qualityFile, + config.output.conditionBarcodeCountsSummaryFile, + state, + rowReference, + colReference, + config.isPairedEnd + ) _ <- umiInfo.fold(().pure[Try])(_ => UmiQualityWriter.write(config.output.umiQualityFile, state)) _ = log.info(s"Writing log-normalized counts file ${config.output.normalizedCountsFile}") normalizedCounts = LogNormalizedCountsWriter.logNormalizedCounts(counts, rowReference, colReference) diff --git a/src/main/scala/org/broadinstitute/gpp/poolq3/reports/QualityWriter.scala b/src/main/scala/org/broadinstitute/gpp/poolq3/reports/QualityWriter.scala index 3c5c654..ff050ab 100644 --- a/src/main/scala/org/broadinstitute/gpp/poolq3/reports/QualityWriter.scala +++ b/src/main/scala/org/broadinstitute/gpp/poolq3/reports/QualityWriter.scala @@ -16,17 +16,39 @@ import org.broadinstitute.gpp.poolq3.reference.Reference object QualityWriter { + class TeeWriter(w1: PrintWriter, w2: PrintWriter) { + + def print(s: String): Unit = { + w1.print(s) + w2.print(s) + } + + def println(s: String): Unit = { + w1.println(s) + w2.println(s) + } + + def println(): Unit = { + w1.println() + w2.println() + } + + } + def write( qualityFile: Path, + conditionBarcodeCountsSummaryFile: Path, state: State, rowReference: Reference, colReference: Reference, isPairedEnd: Boolean ): Try[Unit] = - Using(new PrintWriter(qualityFile.toFile)) { writer => - val barcodeLocationStats = - if (isPairedEnd) { - s"""Reads with no construct barcode: ${state.rowBarcodeNotFound + state.revRowBarcodeNotFound - state.neitherRowBarcodeFound} + Try { + Using.resources(new PrintWriter(qualityFile.toFile), new PrintWriter(conditionBarcodeCountsSummaryFile.toFile)) { + case (qualityWriter, cbcsWriter) => + val barcodeLocationStats = + if (isPairedEnd) { + s"""Reads with no construct barcode: ${state.rowBarcodeNotFound + state.revRowBarcodeNotFound - state.neitherRowBarcodeFound} | |Reads with no forward construct barcode: ${state.rowBarcodeNotFound} |Max forward construct barcode index: ${state.rowBarcodeStats.maxPosStr} @@ -38,15 +60,15 @@ object QualityWriter { |Min reverse construct barcode index: ${state.revRowBarcodeStats.minPosStr} |Avg reverse construct barcode index: ${decOptFmt(state.revRowBarcodeStats.avg)}""".stripMargin - } else { - s"""Reads with no construct barcode: ${state.rowBarcodeNotFound} + } else { + s"""Reads with no construct barcode: ${state.rowBarcodeNotFound} |Max construct barcode index: ${state.rowBarcodeStats.maxPosStr} |Min construct barcode index: ${state.rowBarcodeStats.minPosStr} |Avg construct barcode index: ${decOptFmt(state.rowBarcodeStats.avg)}""".stripMargin - } + } - val header = - s"""Total reads: ${state.reads} + val header = + s"""Total reads: ${state.reads} |Matching reads: ${state.matches} |1-base mismatch reads: ${state.matches - state.exactMatches} | @@ -55,25 +77,29 @@ object QualityWriter { |$barcodeLocationStats |""".stripMargin - writer.println(header) + qualityWriter.println(header) - writer.println(s"Read counts for sample barcodes with associated conditions:") - writer.println( - s"Barcode\tCondition\tMatched (Construct+Sample Barcode)\tMatched Sample Barcode\t% Match\tNormalized Match" - ) - colReference.allBarcodes.foreach { colBarcode => - val data = perBarcodeQualityData(state, rowReference, colReference, colBarcode) - writer.println(data.mkString("\t")) - } + qualityWriter.println(s"Read counts for sample barcodes with associated conditions:") + + // use a TeeWriter for the next section of the report + val tw = new TeeWriter(qualityWriter, cbcsWriter) + tw.println( + s"Barcode\tCondition\tMatched (Construct+Sample Barcode)\tMatched Sample Barcode\t% Match\tNormalized Match" + ) + colReference.allBarcodes.foreach { colBarcode => + val data = perBarcodeQualityData(state, rowReference, colReference, colBarcode) + tw.println(data.mkString("\t")) + } - writer.println() - writer.println("Read counts for most common sample barcodes without associated conditions:") - val unepectedBarcodeFrequencies = - state.unknownCol.keys.map(barcode => BarcodeFrequency(barcode, state.unknownCol.count(barcode))).toSeq - topN(unepectedBarcodeFrequencies, 100).foreach { case BarcodeFrequency(barcode, count) => - writer.println(barcode + "\t" + count.toString) + qualityWriter.println() + qualityWriter.println("Read counts for most common sample barcodes without associated conditions:") + val unepectedBarcodeFrequencies = + state.unknownCol.keys.map(barcode => BarcodeFrequency(barcode, state.unknownCol.count(barcode))).toSeq + topN(unepectedBarcodeFrequencies, 100).foreach { case BarcodeFrequency(barcode, count) => + qualityWriter.println(barcode + "\t" + count.toString) + } + qualityWriter.println() } - writer.println() } private[this] def decOptFmt(d: Option[Double]): String = d.map(Decimal00Format.format).getOrElse("N/A") From fbbb03d9fb7a6736f3666960bd64ba93a43cae04 Mon Sep 17 00:00:00 2001 From: Mark Tomko Date: Thu, 8 Feb 2024 15:00:32 -0800 Subject: [PATCH 06/12] Update existing tests --- .../integration/UnlabeledConditionsTest.scala | 2 ++ .../integration/legacy/LegacyIntegrationTest.scala | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/src/test/scala/org/broadinstitute/gpp/poolq3/integration/UnlabeledConditionsTest.scala b/src/test/scala/org/broadinstitute/gpp/poolq3/integration/UnlabeledConditionsTest.scala index 09b9611..12c6938 100644 --- a/src/test/scala/org/broadinstitute/gpp/poolq3/integration/UnlabeledConditionsTest.scala +++ b/src/test/scala/org/broadinstitute/gpp/poolq3/integration/UnlabeledConditionsTest.scala @@ -23,6 +23,7 @@ class UnlabeledConditionsTest extends CatsEffectSuite with TestResources { barcodeCountsFile <- tempFile[IO]("barcode-counts", ".txt") normalizedCountsFile <- tempFile[IO]("normcounts", ".txt") qualityFile <- tempFile[IO]("quality", ".txt") + conditionBarcodeCountsSummaryFile <- tempFile[IO]("condition-barcode-counts-summary", ".txt") correlationFile <- tempFile[IO]("correlation", ".txt") unexpectedSequencesFile <- tempFile[IO]("unexpected", ".txt") runInfoFile <- tempFile[IO]("runinfo", ".txt") @@ -32,6 +33,7 @@ class UnlabeledConditionsTest extends CatsEffectSuite with TestResources { normalizedCountsFile = normalizedCountsFile, barcodeCountsFile = barcodeCountsFile, qualityFile = qualityFile, + conditionBarcodeCountsSummaryFile = conditionBarcodeCountsSummaryFile, correlationFile = correlationFile, unexpectedSequencesFile = unexpectedSequencesFile, runInfoFile = runInfoFile diff --git a/src/test/scala/org/broadinstitute/gpp/poolq3/integration/legacy/LegacyIntegrationTest.scala b/src/test/scala/org/broadinstitute/gpp/poolq3/integration/legacy/LegacyIntegrationTest.scala index b08e733..2333ec3 100644 --- a/src/test/scala/org/broadinstitute/gpp/poolq3/integration/legacy/LegacyIntegrationTest.scala +++ b/src/test/scala/org/broadinstitute/gpp/poolq3/integration/legacy/LegacyIntegrationTest.scala @@ -33,6 +33,7 @@ class LegacyIntegrationTest extends AnyFlatSpec with TestResources { barcodeCountsFile <- File.temporaryFile("barcode-counts", ".txt") normalizedCountsFile <- File.temporaryFile("normcounts", ".txt") qualityFile <- File.temporaryFile("quality", ".txt") + conditionBarcodeCountsSummaryFile <- File.temporaryFile("condition-barcode-counts-summary", ".txt") correlationFile <- File.temporaryFile("correlation", ".txt") unexpectedSequencesFile <- File.temporaryFile("unexpected", ".txt") unexpectedSequenceCacheDir <- File.temporaryDirectory("unexpected-cache") @@ -49,6 +50,7 @@ class LegacyIntegrationTest extends AnyFlatSpec with TestResources { normalizedCountsFile = normalizedCountsFile.toJava.toPath, barcodeCountsFile = barcodeCountsFile.toJava.toPath, qualityFile = qualityFile.toJava.toPath, + conditionBarcodeCountsSummaryFile = conditionBarcodeCountsSummaryFile.toJava.toPath, correlationFile = correlationFile.toJava.toPath, unexpectedSequencesFile = unexpectedSequencesFile.toJava.toPath, runInfoFile = runInfoFile.toJava.toPath @@ -75,6 +77,7 @@ class LegacyIntegrationTest extends AnyFlatSpec with TestResources { barcodeCountsFile <- File.temporaryFile("barcode-counts", ".txt") normalizedCountsFile <- File.temporaryFile("normcounts", ".txt") qualityFile <- File.temporaryFile("quality", ".txt") + conditionBarcodeCountsSummaryFile <- File.temporaryFile("condition-barcode-counts-summary", ".txt") correlationFile <- File.temporaryFile("correlation", ".txt") unexpectedSequencesFile <- File.temporaryFile("unexpected", ".txt") unexpectedSequenceCacheDir <- File.temporaryDirectory("unexpected-cache") @@ -91,6 +94,7 @@ class LegacyIntegrationTest extends AnyFlatSpec with TestResources { normalizedCountsFile = normalizedCountsFile.toJava.toPath, barcodeCountsFile = barcodeCountsFile.toJava.toPath, qualityFile = qualityFile.toJava.toPath, + conditionBarcodeCountsSummaryFile = conditionBarcodeCountsSummaryFile.toJava.toPath, correlationFile = correlationFile.toJava.toPath, unexpectedSequencesFile = unexpectedSequencesFile.toJava.toPath, runInfoFile = runInfoFile.toJava.toPath @@ -126,6 +130,7 @@ class LegacyIntegrationTest extends AnyFlatSpec with TestResources { normalizedCountsFile <- File.temporaryFile("normcounts", ".txt") barcodeCountsFile <- File.temporaryFile("barcode-counts", ".txt") qualityFile <- File.temporaryFile("quality", ".txt") + conditionBarcodeCountsSummaryFile <- File.temporaryFile("condition-barcode-counts-summary", ".txt") correlationFile <- File.temporaryFile("correlation", ".txt") unexpectedSequencesFile <- File.temporaryFile("unexpected", ".txt") unexpectedSequenceCacheDir <- File.temporaryDirectory("unexpected-cache") @@ -142,6 +147,7 @@ class LegacyIntegrationTest extends AnyFlatSpec with TestResources { normalizedCountsFile = normalizedCountsFile.toJava.toPath, barcodeCountsFile = barcodeCountsFile.toJava.toPath, qualityFile = qualityFile.toJava.toPath, + conditionBarcodeCountsSummaryFile = conditionBarcodeCountsSummaryFile.toJava.toPath, correlationFile = correlationFile.toJava.toPath, unexpectedSequencesFile = unexpectedSequencesFile.toJava.toPath, runInfoFile = runInfoFile.toJava.toPath @@ -170,6 +176,7 @@ class LegacyIntegrationTest extends AnyFlatSpec with TestResources { normalizedCountsFile <- File.temporaryFile("normcounts", ".txt") barcodeCountsFile <- File.temporaryFile("barcode-counts", ".txt") qualityFile <- File.temporaryFile("quality", ".txt") + conditionBarcodeCountsSummaryFile <- File.temporaryFile("condition-barcode-counts-summary", ".txt") correlationFile <- File.temporaryFile("correlation", ".txt") unexpectedSequencesFile <- File.temporaryFile("unexpected", ".txt") unexpectedSequenceCacheDir <- File.temporaryDirectory("unexpected-cache") @@ -186,6 +193,7 @@ class LegacyIntegrationTest extends AnyFlatSpec with TestResources { normalizedCountsFile = normalizedCountsFile.toJava.toPath, barcodeCountsFile = barcodeCountsFile.toJava.toPath, qualityFile = qualityFile.toJava.toPath, + conditionBarcodeCountsSummaryFile = conditionBarcodeCountsSummaryFile.toJava.toPath, correlationFile = correlationFile.toJava.toPath, unexpectedSequencesFile = unexpectedSequencesFile.toJava.toPath, runInfoFile = runInfoFile.toJava.toPath @@ -215,6 +223,7 @@ class LegacyIntegrationTest extends AnyFlatSpec with TestResources { normalizedCountsFile <- File.temporaryFile("normcounts", ".txt") barcodeCountsFile <- File.temporaryFile("barcode-counts", ".txt") qualityFile <- File.temporaryFile("quality", ".txt") + conditionBarcodeCountsSummaryFile <- File.temporaryFile("condition-barcode-counts-summary", ".txt") correlationFile <- File.temporaryFile("correlation", ".txt") unexpectedSequencesFile <- File.temporaryFile("unexpected", ".txt") unexpectedSequenceCacheDir <- File.temporaryDirectory("unexpected-cache") @@ -232,6 +241,7 @@ class LegacyIntegrationTest extends AnyFlatSpec with TestResources { normalizedCountsFile = normalizedCountsFile.toJava.toPath, barcodeCountsFile = barcodeCountsFile.toJava.toPath, qualityFile = qualityFile.toJava.toPath, + conditionBarcodeCountsSummaryFile = conditionBarcodeCountsSummaryFile.toJava.toPath, correlationFile = correlationFile.toJava.toPath, unexpectedSequencesFile = unexpectedSequencesFile.toJava.toPath, runInfoFile = runInfoFile.toJava.toPath @@ -255,6 +265,7 @@ class LegacyIntegrationTest extends AnyFlatSpec with TestResources { normalizedCountsFile <- File.temporaryFile("normcounts", ".txt") barcodeCountsFile <- File.temporaryFile("barcode-counts", ".txt") qualityFile <- File.temporaryFile("quality", ".txt") + conditionBarcodeCountsSummaryFile <- File.temporaryFile("condition-barcode-counts-summary", ".txt") correlationFile <- File.temporaryFile("correlation", ".txt") unexpectedSequencesFile <- File.temporaryFile("unexpected", ".txt") unexpectedSequenceCacheDir <- File.temporaryDirectory("unexpected-cache") @@ -272,6 +283,7 @@ class LegacyIntegrationTest extends AnyFlatSpec with TestResources { normalizedCountsFile = normalizedCountsFile.toJava.toPath, barcodeCountsFile = barcodeCountsFile.toJava.toPath, qualityFile = qualityFile.toJava.toPath, + conditionBarcodeCountsSummaryFile = conditionBarcodeCountsSummaryFile.toJava.toPath, correlationFile = correlationFile.toJava.toPath, unexpectedSequencesFile = unexpectedSequencesFile.toJava.toPath, runInfoFile = runInfoFile.toJava.toPath @@ -299,6 +311,7 @@ class LegacyIntegrationTest extends AnyFlatSpec with TestResources { normalizedCountsFile <- File.temporaryFile("normcounts", ".txt") barcodeCountsFile <- File.temporaryFile("barcode-counts", ".txt") qualityFile <- File.temporaryFile("quality", ".txt") + conditionBarcodeCountsSummaryFile <- File.temporaryFile("condition-barcode-counts-summary", ".txt") correlationFile <- File.temporaryFile("correlation", ".txt") unexpectedSequencesFile <- File.temporaryFile("unexpected", ".txt") unexpectedSequenceCacheDir <- File.temporaryDirectory("unexpected-cache") @@ -315,6 +328,7 @@ class LegacyIntegrationTest extends AnyFlatSpec with TestResources { normalizedCountsFile = normalizedCountsFile.toJava.toPath, barcodeCountsFile = barcodeCountsFile.toJava.toPath, qualityFile = qualityFile.toJava.toPath, + conditionBarcodeCountsSummaryFile = conditionBarcodeCountsSummaryFile.toJava.toPath, correlationFile = correlationFile.toJava.toPath, unexpectedSequencesFile = unexpectedSequencesFile.toJava.toPath, runInfoFile = runInfoFile.toJava.toPath From 51500e76e7154dded6c9e54953a6114c54b8adeb Mon Sep 17 00:00:00 2001 From: Mark Tomko Date: Thu, 8 Feb 2024 15:00:39 -0800 Subject: [PATCH 07/12] Test for summary writer --- .../ConditionBarcodeCountsSummaryTest.scala | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 src/test/scala/org/broadinstitute/gpp/poolq3/reports/ConditionBarcodeCountsSummaryTest.scala diff --git a/src/test/scala/org/broadinstitute/gpp/poolq3/reports/ConditionBarcodeCountsSummaryTest.scala b/src/test/scala/org/broadinstitute/gpp/poolq3/reports/ConditionBarcodeCountsSummaryTest.scala new file mode 100644 index 0000000..b0e4340 --- /dev/null +++ b/src/test/scala/org/broadinstitute/gpp/poolq3/reports/ConditionBarcodeCountsSummaryTest.scala @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2022 The Broad Institute, Inc. All rights reserved. + * + * SPDX-License-Identifier: BSD-3-Clause + */ +package org.broadinstitute.gpp.poolq3.reports + +import cats.effect.IO +import fs2.io.file.Files +import munit.CatsEffectSuite +import org.broadinstitute.gpp.poolq3.hist.{BasicShardedHistogram, OpenHashMapHistogram, TupleHistogram} +import org.broadinstitute.gpp.poolq3.parser.ReferenceEntry +import org.broadinstitute.gpp.poolq3.process.State +import org.broadinstitute.gpp.poolq3.reference.ExactReference + +class ConditionBarcodeCountsSummaryTest extends CatsEffectSuite { + + private val Condition1 = "DMSO" + private val Condition2 = "ITMFA" + private val Condition3 = "No Drug" + private val SampleBarcode1 = "GTAT" + private val SampleBarcode2 = "ACAT" + private val SampleBarcode3 = "TCAG" + private val SampleBarcode4 = "TCCG" + + // we only need 1 construct to populate the report + private val Construct1 = "AACCGGTTAACCGGTTTTAAG" + private val ConstructId1 = "BRDN01" + + private val Constructs = List(ReferenceEntry(Construct1, ConstructId1)) + + private val rowReference = ExactReference(Constructs, identity, includeAmbiguous = false) + + private val colReference = + ExactReference( + List( + ReferenceEntry(SampleBarcode1, Condition1), + ReferenceEntry(SampleBarcode2, Condition2), + ReferenceEntry(SampleBarcode3, Condition3), + ReferenceEntry(SampleBarcode4, Condition3) + ), + identity, + includeAmbiguous = false + ) + + def emptyState(): State = + new State( + new BasicShardedHistogram[String, (String, String)](new TupleHistogram()), + new OpenHashMapHistogram(), + new OpenHashMapHistogram(), + new OpenHashMapHistogram() + ) + + test("condition barcode counts summary") { + val sample1MatchesBoth = 10 + val sample2MatchesBoth = 7 + val sample3MatchesBoth = 28 + val sample4MatchesBoth = 3 + + val sample1MatchesCol = sample1MatchesBoth + 8 + val sample2MatchesCol = sample2MatchesBoth + 3 + val sample3MatchesCol = sample3MatchesBoth + 17 + val sample4MatchesCol = sample4MatchesBoth + 11 + + Files[IO].tempDirectory.use { tmpDir => + val cbcs = tmpDir / "cbcs.txt" + + // fill out state + val state = emptyState() + 0.until(sample1MatchesBoth).foreach(_ => state.known.increment(None, (Construct1, SampleBarcode1))) + 0.until(sample2MatchesBoth).foreach(_ => state.known.increment(None, (Construct1, SampleBarcode2))) + 0.until(sample3MatchesBoth).foreach(_ => state.known.increment(None, (Construct1, SampleBarcode3))) + 0.until(sample4MatchesBoth).foreach(_ => state.known.increment(None, (Construct1, SampleBarcode4))) + + 0.until(sample1MatchesCol).foreach(_ => state.knownCol.increment(SampleBarcode1)) + 0.until(sample2MatchesCol).foreach(_ => state.knownCol.increment(SampleBarcode2)) + 0.until(sample3MatchesCol).foreach(_ => state.knownCol.increment(SampleBarcode3)) + 0.until(sample4MatchesCol).foreach(_ => state.knownCol.increment(SampleBarcode4)) + + state.reads = sample1MatchesCol + sample2MatchesCol + sample3MatchesCol + sample4MatchesCol + 5 + + IO.blocking { + QualityWriter + .write((tmpDir / "quality.txt").toNioPath, cbcs.toNioPath, state, rowReference, colReference, false) + .get + } >> + Files[IO] + .readUtf8(cbcs) + .compile + .lastOrError + .assertEquals( + """Barcode Condition Matched (Construct+Sample Barcode) Matched Sample Barcode % Match Normalized Match + |GTAT DMSO 10 18 55.56 16.730 + |ACAT ITMFA 7 10 70.00 16.215 + |TCAG No Drug 28 45 62.22 18.215 + |TCCG No Drug 3 14 21.43 14.993 + |""".stripMargin + ) + + } + + } + +} From 66e28a5c506752aa6b5e7e38298ecdc54a188005 Mon Sep 17 00:00:00 2001 From: Mark Tomko Date: Thu, 8 Feb 2024 15:03:41 -0800 Subject: [PATCH 08/12] Set version to 3.10.0-SNAPSHOT --- version.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.sbt b/version.sbt index ba7d8c0..129758e 100644 --- a/version.sbt +++ b/version.sbt @@ -1 +1 @@ -ThisBuild / version := "3.9.1-SNAPSHOT" +ThisBuild / version := "3.10.0-SNAPSHOT" From 3124a44badcae5e1099dc72c32e9c7f3d283120a Mon Sep 17 00:00:00 2001 From: Mark Tomko Date: Thu, 8 Feb 2024 15:06:29 -0800 Subject: [PATCH 09/12] Add file to command line parser --- .../scala/org/broadinstitute/gpp/poolq3/PoolQConfig.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQConfig.scala b/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQConfig.scala index e3491ac..b32ec39 100644 --- a/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQConfig.scala +++ b/src/main/scala/org/broadinstitute/gpp/poolq3/PoolQConfig.scala @@ -254,6 +254,11 @@ object PoolQConfig { val _ = opt[Path]("quality").valueName("").action((f, c) => c.copy(output = c.output.copy(qualityFile = f))) + val _ = + opt[Path]("condition-barcode-counts-summary") + .valueName("") + .action((f, c) => c.copy(output = c.output.copy(conditionBarcodeCountsSummaryFile = f))) + val _ = opt[Path]("counts").valueName("").action((f, c) => c.copy(output = c.output.copy(countsFile = f))) val _ = opt[Path]("normalized-counts").valueName("").action { (f, c) => From 0ddc465f201dfa292e053fb2a207d7a7e90864be Mon Sep 17 00:00:00 2001 From: Mark Tomko Date: Thu, 8 Feb 2024 15:06:59 -0800 Subject: [PATCH 10/12] Update readme and manual --- README.md | 24 ++++++++++++------------ docs/MANUAL.md | 8 +++++--- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index fbb14ea..14365ae 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # PoolQ 3.0 -Copyright (c) 2022 Genetic Perturbation Platform, The Broad Institute of Harvard and MIT. +Copyright (c) 2024 Genetic Perturbation Platform, The Broad Institute of Harvard and MIT. [![Build Status](https://github.com/broadinstitute/poolq/actions/workflows/ci.yml/badge.svg)](https://github.com/broadinstitute/poolq/actions/workflows/ci.yml) @@ -8,9 +8,9 @@ Copyright (c) 2022 Genetic Perturbation Platform, The Broad Institute of Harvard PoolQ is a counter for indexed samples from next-generation sequencing of pooled DNA. Given a set of sequencing data files (FASTQ, SAM, or BAM), and a pair of reference files mapping DNA barcodes -to construct or experimental identifiers, PoolQ reads the sequencing data and tallies the +to construct or experimental identifiers, PoolQ reads the sequencing data and tallies the co-occurrence of each pair of barcodes from the two files, yielding a two-dimensional histogram. -The barcodes in one reference file are treated as rows in the histogram; the other correspond to +The barcodes in one reference file are treated as rows in the histogram; the other correspond to columns. PoolQ is capable of locating barcodes within reads using a variety of techniques: @@ -22,16 +22,16 @@ It matches barcodes to reference data either exactly or allowing up to one base PoolQ does not support matching with gaps or deletions. In addition to producing a histogram, PoolQ generates a number of reports, which contain statistics and -other information that can be used to troubleshoot experiments. These include match percentages, barcode +other information that can be used to troubleshoot experiments. These include match percentages, barcode locations, matching correlations between barcodes, and lists of frequently-occurring unknown barcodes. ## Documentation -For information on how to run PoolQ and its various modes and options, please see the +For information on how to run PoolQ and its various modes and options, please see the [manual](docs/MANUAL.md). We also maintain a [changelog](CHANGELOG.md) listing updates made to PoolQ. -As of version 3.5.0, the source code to PoolQ is available under a [BSD 3-clause license](LICENSE). We +As of version 3.5.0, the source code to PoolQ is available under a [BSD 3-clause license](LICENSE). We welcome contributions to PoolQ and have created a [contributor guide](CONTRIBUTING.md). Additionally, -we maintain a [list](NOTICE.txt) of other open-source libraries PoolQ depends on, along with links to +we maintain a [list](NOTICE.txt) of other open-source libraries PoolQ depends on, along with links to associated licenses. ## Changes in PoolQ 3 @@ -40,7 +40,7 @@ PoolQ was completely rewritten for version 3. The new code is faster and the cod and more maintainable. We have taken the opportunity to make other changes to PoolQ as well. * There are substantial changes to the command-line interface for the program. -* The default counts file format has changed slightly, although there is a command-line +* The default counts file format has changed slightly, although there is a command-line argument that indicates that PoolQ 3 should write a backwards-compatible counts file. The differences are in headers only; file parsers should be able to adapt easily. * The quality file has changed somewhat. Importantly, the definition of certain statistics has changed @@ -51,14 +51,14 @@ See the [manual](docs/MANUAL.md) for complete details on the differences version ## PoolQ 2 support -We will continue to make the PoolQ 2.4 artifacts available for download on the +We will continue to make the PoolQ 2.4 artifacts available for download on the [GPP portal](https://portals.broadinstitute.org/gpp/public/software/poolq). We have no plans to add -features to the code. We will address bugs on a case-by-case basis; in general only critical +features to the code. We will address bugs on a case-by-case basis; in general only critical bugfixes will be ported to versions prior to 2.4, effective immediately. -## Maintainers +## Maintainers -PoolQ was originally developed by John Sullivan and Shuba Gopal of the Broad Institute RNAi Platform. It +PoolQ was originally developed by John Sullivan and Shuba Gopal of the Broad Institute RNAi Platform. It is maintained by Mark Tomko of the Broad Institute Genetic Perturbation Platform. ## Contact Us diff --git a/docs/MANUAL.md b/docs/MANUAL.md index da53a9b..954dba9 100644 --- a/docs/MANUAL.md +++ b/docs/MANUAL.md @@ -2,7 +2,7 @@ PoolQ is a counter for indexed samples from next-gen sequencing of pooled DNA. -_This documentation covers PoolQ version 3.7.0 (last updated 09/05/2023)._ +_This documentation covers PoolQ version 3.10.0 (last updated 02/08/2024)._ ## Background @@ -559,7 +559,7 @@ PoolQ you will need a Java 8 JDK. You can download an appropriate JRE or JDK fro You can download PoolQ from an as yet undetermined location. The file you download is a ZIP file that you will need to unzip. In most cases, this is as simple as right-clicking on the zip file, and selecting something like "extract contents" from the popup menu. This will create a new folder on -your computer named `poolq-3.7.0`, with the following contents: +your computer named `poolq-3.10.0`, with the following contents: - `poolq3.jar` - `poolq3.bat` @@ -627,7 +627,7 @@ how to launch programs from the command line on your given operating system. If you successfully launched PoolQ, you should see a usage message explaining all of the command-line options: - poolq3 3.7.0 + poolq3 3.10.0 Usage: poolq [options] --row-reference reference file for row barcodes (i.e., constructs) @@ -652,6 +652,7 @@ command-line options: --umi-counts-dir --umi-barcode-counts-dir --quality + --condition-barcode-counts-summary --counts --normalized-counts --barcode-counts @@ -661,6 +662,7 @@ command-line options: --correlation --run-info --unexpected-sequence-threshold + --unexpected-sequence-sample-pct --unexpected-sequences --umi-quality --unexpected-sequence-cache From af63028845d950b36153bc58c2661397409d897f Mon Sep 17 00:00:00 2001 From: Mark Tomko Date: Thu, 8 Feb 2024 15:07:04 -0800 Subject: [PATCH 11/12] Update changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8357a89..04594c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## 3.10.0 +* Machine-parseable condition barcode summary file + ## 3.9.0 * Use sampling technique for generating unexpected sequence reports From fdb907b2a81cc609ae8b02777a14bc4944353602 Mon Sep 17 00:00:00 2001 From: Mark Tomko Date: Thu, 8 Feb 2024 21:56:18 -0800 Subject: [PATCH 12/12] Bugfix to makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index aa004d7..3c52347 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -fullversion := $(shell grep -m 1 'ThisBuild / version :=' ./version.sbt | perl -pe 's/^ThisBuild \/ version := "([0-9]+\.[0-9+]\.[0-9]+).*$$/$$1/g') +fullversion := $(shell grep -m 1 'ThisBuild / version :=' ./version.sbt | perl -pe 's/^ThisBuild \/ version := "([0-9]+\.[0-9]+\.[0-9]+).*$$/$$1/g') version := $(shell grep -m 1 'ThisBuild / version :=' ./version.sbt | perl -pe 's/^ThisBuild \/ version := "([0-9]+\.[0-9+]).*$$/$$1/g')