From b46b5b47d4ee847617102c3144f24609c659b313 Mon Sep 17 00:00:00 2001
From: Paul Butcher
Date: Fri, 5 Apr 2024 11:50:06 +0100
Subject: [PATCH] add source-independent contributor tests

---
Review notes:
 * The "respects document order" test now compares whole sequences with
   `shouldBe`; `contain theSameElementsAs` ignores order, so the ordering
   that the test names was never actually checked.
 * Hunk and diffstat counts include the 9 comment lines added in review.
   The blob id on the new file's `index` line predates these changes.
 * Line breaks and indentation were restored from a whitespace-collapsed
   copy; check the context lines of the MarcContributors.scala hunks
   against the target file (or apply with --ignore-whitespace).

 .../transformers/MarcContributors.scala       |   4 +-
 .../transformers/MarcContributorsTest.scala   | 411 ++++++++++++++++++
 2 files changed, 413 insertions(+), 2 deletions(-)
 create mode 100644 pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcContributorsTest.scala

diff --git a/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcContributors.scala b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcContributors.scala
index 05ee1c0621..2b0c9cb9a5 100644
--- a/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcContributors.scala
+++ b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcContributors.scala
@@ -42,7 +42,7 @@ object MarcContributors
     val primaries = record
       .fieldsWithTags("100", "110", "111")
     val secondaries = record.fieldsWithTags("700", "710", "711")
-    filterSecondaryDuplicates(
+    filterDuplicates(
       (primaries ++ secondaries)
         .flatMap(field => singleContributor(field))
     ).toList.harmoniseOntologyTypes
@@ -59,7 +59,7 @@ object MarcContributors
     *
     * We do not want this duplication in the output.
     */
-  private def filterSecondaryDuplicates(allContributors: Output): Output = {
+  private def filterDuplicates(allContributors: Output): Output = {
     val duplicatedContributors =
       allContributors
         .filter(_.primary == false)
diff --git a/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcContributorsTest.scala b/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcContributorsTest.scala
new file mode 100644
index 0000000000..a2274ce710
--- /dev/null
+++ b/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcContributorsTest.scala
@@ -0,0 +1,411 @@
+package weco.pipeline.transformer.marc_common.transformers
+
+import org.scalatest.Inspectors
+import org.scalatest.funspec.AnyFunSpec
+import org.scalatest.matchers.should.Matchers
+import weco.catalogue.internal_model.work.{Agent, Person}
+import weco.pipeline.transformer.marc_common.generators.MarcTestRecord
+import weco.pipeline.transformer.marc_common.logging.LoggingContext
+import weco.pipeline.transformer.marc_common.models.{MarcField, MarcSubfield}
+
+/** Tests for [[MarcContributors]], driven by records built directly from
+  * MarcTestRecord / MarcField / MarcSubfield.
+  *
+  * Covers: records with no usable 1xx/7xx fields, primary (100/110/111) and
+  * non-primary (700/710/711) contributors and their output order, the
+  * Agent/Person handling of fields with a subfield t, and deduplication.
+  */
+class MarcContributorsTest extends AnyFunSpec with Matchers with Inspectors {
+
+  private implicit val ctx: LoggingContext = LoggingContext("")
+  describe(
+    "extracting contributors from Main Entry (1xx) and Added Entry (7xx) fields"
+  ) {
+    info("https://www.loc.gov/marc/bibliographic/bd1xx.html")
+    info("https://www.loc.gov/marc/bibliographic/bd70x75x.html")
+    describe("When there are no contributors") {
+      it("returns nothing if no relevant fields are present") {
+        MarcContributors(
+          MarcTestRecord(fields =
+            Seq(
+              MarcField(
+                marcTag = "245",
+                subfields = Seq(MarcSubfield(tag = "a", content = "The Title"))
+              )
+            )
+          )
+        ) shouldBe Nil
+      }
+
+      it("returns nothing if the only relevant fields are invalid") {
+        MarcContributors(
+          MarcTestRecord(fields =
+            Seq(
+              MarcField(
+                marcTag = "110",
+                subfields = Nil
+              ),
+              MarcField(
+                marcTag = "100",
+                subfields = Seq(
+                  MarcSubfield(tag = "a", content = " "),
+                  MarcSubfield(tag = "b", content = ""),
+                  MarcSubfield(tag = "c", content = " ")
+                )
+              )
+            )
+          )
+        ) shouldBe Nil
+      }
+
+    }
+
+    describe("When there are contributors to transform") {
+      it("returns a mix of primary and non-primary contributors") {
+        val contributors = MarcContributors(
+          MarcTestRecord(fields =
+            Seq(
+              MarcField(
+                marcTag = "100",
+                subfields = Seq(MarcSubfield(tag = "a", content = "Euripedes"))
+              ),
+              MarcField(
+                marcTag = "110",
+                subfields = Seq(
+                  MarcSubfield(tag = "a", content = "Felpersham University")
+                )
+              ),
+              MarcField(
+                marcTag = "111",
+                subfields =
+                  Seq(MarcSubfield(tag = "a", content = "Council of Elrond"))
+              ),
+              MarcField(
+                marcTag = "700",
+                subfields =
+                  Seq(MarcSubfield(tag = "a", content = "Anil Mendem"))
+              ),
+              MarcField(
+                marcTag = "710",
+                subfields = Seq(
+                  MarcSubfield(tag = "a", content = "University of Inverdoon")
+                )
+              ),
+              MarcField(
+                marcTag = "711",
+                subfields = Seq(
+                  MarcSubfield(
+                    tag = "a",
+                    content = "Xenophon's Symposium"
+                  )
+                )
+              )
+            )
+          )
+        )
+        contributors.map(_.primary) should contain theSameElementsAs Seq(
+          true, true, true, false, false, false
+        )
+        contributors.map(_.agent.label) should contain theSameElementsAs Seq(
+          "Euripedes",
+          "Felpersham University",
+          "Council of Elrond",
+          "Anil Mendem",
+          "University of Inverdoon",
+          "Xenophon's Symposium"
+        )
+      }
+      it(
+        "returns primary contributors first, but otherwise respects document order"
+      ) {
+        val contributors = MarcContributors(
+          MarcTestRecord(fields =
+            Seq(
+              MarcField(
+                marcTag = "711",
+                subfields = Seq(
+                  MarcSubfield(
+                    tag = "a",
+                    content = "Xenophon's Symposium"
+                  )
+                )
+              ),
+              MarcField(
+                marcTag = "700",
+                subfields =
+                  Seq(MarcSubfield(tag = "a", content = "Anil Mendem"))
+              ),
+              MarcField(
+                marcTag = "100",
+                subfields = Seq(MarcSubfield(tag = "a", content = "Euripedes"))
+              ),
+              MarcField(
+                marcTag = "710",
+                subfields = Seq(
+                  MarcSubfield(tag = "a", content = "University of Inverdoon")
+                )
+              ),
+              MarcField(
+                marcTag = "110",
+                subfields = Seq(
+                  MarcSubfield(tag = "a", content = "Felpersham University")
+                )
+              ),
+              MarcField(
+                marcTag = "111",
+                subfields =
+                  Seq(MarcSubfield(tag = "a", content = "Council of Elrond"))
+              )
+            )
+          )
+        )
+        // Order is the behaviour under test, so compare whole sequences;
+        // `contain theSameElementsAs` would accept any ordering.
+        contributors.map(_.primary) shouldBe Seq(
+          true, true, true, false, false, false
+        )
+        contributors.map(_.agent.label) shouldBe Seq(
+          "Euripedes",
+          "Felpersham University",
+          "Council of Elrond",
+          "Xenophon's Symposium",
+          "Anil Mendem",
+          "University of Inverdoon"
+        )
+      }
+      it(
+        "returns an Agent contributor if a Person field contains subfield t: 'Title of a work'"
+      ) {
+        info(
+          "This is a historic quirk, it may not need to be this way, and it's probably incorrect"
+        )
+        val contributors = MarcContributors(
+          MarcTestRecord(fields =
+            Seq(
+              MarcField(
+                marcTag = "700",
+                subfields = Seq(
+                  MarcSubfield(tag = "a", content = "Sæmundr fróði"),
+                  MarcSubfield(tag = "t", content = "Codex Regius")
+                )
+              ),
+              MarcField(
+                marcTag = "100",
+                subfields = Seq(
+                  MarcSubfield(tag = "a", content = "Snorri Sturluson"),
+                  MarcSubfield(tag = "t", content = "Gylfaginning")
+                )
+              )
+            )
+          )
+        )
+        contributors.head.agent should be(an[Agent[_]])
+        contributors(1).agent should be(an[Agent[_]])
+      }
+
+      it("harmonises Agent entries with Person entries if they match") {
+        info("The first two fields generate Agents due to the t subfield")
+        info(
+          "However, because fields with the same ID later generate a Person"
+        )
+        info("the ontologytype harmonisation converts them both to a Person")
+        val contributors = MarcContributors(
+          MarcTestRecord(fields =
+            Seq(
+              MarcField(
+                marcTag = "100",
+                subfields = Seq(
+                  MarcSubfield(tag = "a", content = "Sæmundr fróði"),
+                  MarcSubfield(tag = "t", content = "Codex Regius"),
+                  MarcSubfield(tag = "0", content = "n97018003")
+                )
+              ),
+              MarcField(
+                marcTag = "100",
+                subfields = Seq(
+                  MarcSubfield(tag = "a", content = "Snorri Sturluson"),
+                  MarcSubfield(tag = "t", content = "Gylfaginning"),
+                  MarcSubfield(tag = "0", content = "n50000553")
+                )
+              ),
+              MarcField(
+                marcTag = "700",
+                subfields = Seq(
+                  MarcSubfield(tag = "a", content = "Sæmundr fróði"),
+                  MarcSubfield(tag = "0", content = "n97018003")
+                )
+              ),
+              MarcField(
+                marcTag = "700",
+                subfields = Seq(
+                  MarcSubfield(tag = "a", content = "Snorri Sturluson"),
+                  MarcSubfield(tag = "0", content = "n50000553")
+                )
+              )
+            )
+          )
+        )
+        forAll(contributors) {
+          contributor =>
+            contributor.agent should be(a[Person[_]])
+        }
+      }
+      describe("deduplication") {
+
+        it(
+          "returns only the primary contributor if that contributor is duplicated as an Added Entry"
+        ) {
+          val contributors = MarcContributors(
+            MarcTestRecord(fields =
+              Seq(
+                MarcField(
+                  marcTag = "100",
+                  subfields = Seq(
+                    MarcSubfield(tag = "a", content = "Sæmundr fróði"),
+                    MarcSubfield(tag = "0", content = "n97018003")
+                  )
+                ),
+                MarcField(
+                  marcTag = "100",
+                  subfields = Seq(
+                    MarcSubfield(tag = "a", content = "Snorri Sturluson"),
+                    MarcSubfield(tag = "0", content = "n50000553")
+                  )
+                ),
+                MarcField(
+                  marcTag = "700",
+                  subfields = Seq(
+                    MarcSubfield(tag = "a", content = "Sæmundr fróði"),
+                    MarcSubfield(tag = "0", content = "n97018003")
+                  )
+                ),
+                MarcField(
+                  marcTag = "700",
+                  subfields = Seq(
+                    MarcSubfield(tag = "a", content = "Snorri Sturluson"),
+                    MarcSubfield(tag = "0", content = "n50000553")
+                  )
+                )
+              )
+            )
+          )
+          contributors should have length 2
+          contributors.map(_.agent.label) should contain theSameElementsAs Seq(
+            "Sæmundr fróði",
+            "Snorri Sturluson"
+          )
+          forAll(contributors) {
+            contributor => contributor.primary shouldBe true
+          }
+        }
+
+        it(
+          "deduplicates by matching on the whole contributor"
+        ) {
+          info("It is only the difference in primary status that is ignored")
+          info("when finding and eliminating duplicate contributors")
+
+          val fields = Seq(
+            MarcField(
+              marcTag = "100",
+              subfields = Seq(
+                MarcSubfield(tag = "a", content = "Max Bialystock"),
+                MarcSubfield(tag = "e", content = "producer")
+              )
+            ),
+            MarcField(
+              // differs by presence of id
+              marcTag = "700",
+              subfields = Seq(
+                MarcSubfield(tag = "a", content = "Max Bialystock"),
+                MarcSubfield(tag = "0", content = "no2021024802"),
+                MarcSubfield(tag = "e", content = "producer")
+              )
+            ),
+            MarcField(
+              // is the same
+              marcTag = "700",
+              subfields = Seq(
+                MarcSubfield(tag = "a", content = "Max Bialystock"),
+                MarcSubfield(tag = "e", content = "producer")
+              )
+            ),
+            MarcField(
+              // differs by having a different role
+              marcTag = "700",
+              subfields = Seq(
+                MarcSubfield(tag = "a", content = "Max Bialystock"),
+                MarcSubfield(tag = "e", content = "director")
+              )
+            ),
+            MarcField(
+              // differs by having no role
+              marcTag = "700",
+              subfields = Seq(
+                MarcSubfield(tag = "a", content = "Max Bialystock")
+              )
+            ),
+            MarcField(
+              // is the same
+              marcTag = "700",
+              subfields = Seq(
+                MarcSubfield(tag = "a", content = "Max Bialystock"),
+                MarcSubfield(tag = "e", content = "producer")
+              )
+            )
+          )
+
+          val contributors = MarcContributors(MarcTestRecord(fields = fields))
+          contributors should have length 4
+          // not really an assertion, just to clarify that the two
+          // 700s that are otherwise identical to the 100 are discarded
+          fields should have length 6
+        }
+
+        it(
+          "Also removes fully identical duplicates"
+        ) {
+          val Seq(primary, secondary) = MarcContributors(
+            MarcTestRecord(fields =
+              Seq(
+                MarcField(
+                  marcTag = "100",
+                  subfields = Seq(
+                    MarcSubfield(tag = "a", content = "Sæmundr fróði")
+                  )
+                ),
+                MarcField(
+                  marcTag = "100",
+                  subfields = Seq(
+                    MarcSubfield(tag = "a", content = "Sæmundr fróði")
+                  )
+                ),
+                MarcField(
+                  marcTag = "700",
+                  subfields = Seq(
+                    MarcSubfield(tag = "a", content = "Snorri Sturluson")
+                  )
+                ),
+                MarcField(
+                  marcTag = "700",
+                  subfields = Seq(
+                    MarcSubfield(tag = "a", content = "Snorri Sturluson")
+                  )
+                )
+              )
+            )
+          )
+
+          primary.primary shouldBe true
+          primary.agent.label shouldBe "Sæmundr fróði"
+
+          secondary.primary shouldBe false
+          secondary.agent.label shouldBe "Snorri Sturluson"
+
+        }
+      }
+    }
+
+  }
+
+}