diff --git a/pipeline/transformer/transformer_sierra/src/main/scala/weco/pipeline/transformer/sierra/transformers/OntologyTypeOps.scala b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/OntologyTypeOps.scala similarity index 99% rename from pipeline/transformer/transformer_sierra/src/main/scala/weco/pipeline/transformer/sierra/transformers/OntologyTypeOps.scala rename to pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/OntologyTypeOps.scala index 217d1dedf4..4d4352ce45 100644 --- a/pipeline/transformer/transformer_sierra/src/main/scala/weco/pipeline/transformer/sierra/transformers/OntologyTypeOps.scala +++ b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/OntologyTypeOps.scala @@ -1,4 +1,4 @@ -package weco.pipeline.transformer.sierra.transformers +package weco.pipeline.transformer.marc_common import grizzled.slf4j.Logging import weco.catalogue.internal_model.identifiers.{IdState, SourceIdentifier} diff --git a/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcAbstractAgent.scala b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcAbstractAgent.scala new file mode 100644 index 0000000000..81ac8210b4 --- /dev/null +++ b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcAbstractAgent.scala @@ -0,0 +1,123 @@ +package weco.pipeline.transformer.marc_common.transformers + +import weco.catalogue.internal_model.identifiers.{ + IdState, + IdentifierType, + SourceIdentifier +} +import weco.catalogue.internal_model.work.AbstractAgent +import weco.pipeline.transformer.identifiers.LabelDerivedIdentifiers +import weco.pipeline.transformer.marc_common.models.{MarcField, MarcSubfield} +import weco.pipeline.transformer.text.TextNormalisation.TextNormalisationOps + +import scala.util.{Failure, Success, Try} + +trait MarcAbstractAgent extends LabelDerivedIdentifiers { + type Output = Try[AbstractAgent[IdState.Unminted]] + + protected val ontologyType: String + protected val appropriateFields: Seq[String] + protected val labelSubfieldTags: Seq[String] + protected def createAgent( + label: String, + identifier: IdState.Unminted + ): AbstractAgent[IdState.Unminted] + + /** Construct a label from the subfields representing an agent. + */ + protected def getLabel(field: MarcField): Option[String] = { + val contents = + field.subfields + .filter { + s => labelSubfieldTags.contains(s.tag) + } + .filterNot { _.content.trim.isEmpty } + .map { _.content } + + contents match { + case Nil => None + case nonEmptyList => Some(nonEmptyList mkString " ") + } + } + + protected def normaliseLabel(label: String): String = label.trimTrailing(',') + + /* Given an agent and the associated MARC subfields, look for instances of subfield $0, + * which are used for identifiers. + * TODO: UPDATE THIS COMMENT + * This methods them (if present) and wraps the agent in Unidentifiable or Identifiable + * as appropriate. + */ + // TODO: Consider if this might be a trait in its own right. + // Does it apply to things that are not agents? + // Yes! + protected def getIdentifier( + subfields: Seq[MarcSubfield], + label: String + ): IdState.Unminted = { + + // We take the contents of subfield $0. They may contain inconsistent + // spacing and punctuation, such as: + // + // " nr 82270463" + // "nr 82270463" + // "nr 82270463.," + // + // which all refer to the same identifier. + // + // For consistency, we remove all whitespace and some punctuation + // before continuing. + val codes = subfields.collect { + case MarcSubfield("0", content) => + content.replaceAll("[.,\\s]", "") + } + + // If we get exactly one value, we can use it to identify the record. + // Some records have multiple instances of subfield $0 (it's a repeatable + // field in the MARC spec). + codes.distinct match { + case Seq(code) => + IdState.Identifiable( + SourceIdentifier( + identifierType = IdentifierType.LCNames, + value = code, + ontologyType = ontologyType + ) + ) + case _ => identifierFromText(label = label, ontologyType = ontologyType) + } + } + private def isAppropriateField(field: MarcField): Boolean = + appropriateFields.contains(field.marcTag) + + def apply( + field: MarcField + ): Try[AbstractAgent[IdState.Unminted]] = { + if (isAppropriateField(field)) { + getLabel(field) match { + case Some(label) => + Success( + createAgent( + label, + getIdentifier( + subfields = field.subfields, + label = label + ) + ) + ) + case None => + Failure( + new Exception( + s"no label found when transforming $field into an $ontologyType" + ) + ) + } + } else { + Failure( + new Exception( + s"attempt to transform incompatible MARC field ${field.marcTag} into $ontologyType" + ) + ) + } + } +} diff --git a/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcAgent.scala b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcAgent.scala new file mode 100644 index 0000000000..c4ef9f5b1d --- /dev/null +++ b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcAgent.scala @@ -0,0 +1,28 @@ +package weco.pipeline.transformer.marc_common.transformers + +import weco.catalogue.internal_model.identifiers.IdState +import weco.catalogue.internal_model.work.{AbstractAgent, Agent} + +object MarcAgent extends MarcFieldTransformer with MarcAbstractAgent { + + override protected val ontologyType: String = "Agent" + override protected val appropriateFields: Seq[String] = + Seq("100", "600", "700") + override protected val labelSubfieldTags: Seq[String] = Seq( + "a", + "b", + "c", + "d", + "t", + "n", + "p", + "q", + "l" + ) + + override protected def createAgent( + label: String, + identifier: IdState.Unminted + ): AbstractAgent[IdState.Unminted] = + new Agent(label = label, id = identifier) +} diff --git a/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcContributionRoles.scala b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcContributionRoles.scala new file mode 100644 index 0000000000..dfb90d2618 --- /dev/null +++ b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcContributionRoles.scala @@ -0,0 +1,28 @@ +package weco.pipeline.transformer.marc_common.transformers + +import weco.catalogue.internal_model.work.ContributionRole +import weco.pipeline.transformer.marc_common.models.MarcField + +object MarcContributionRoles extends MarcFieldTransformer { + type Output = Seq[ContributionRole] + private def roleSubfieldCodes(marcTag: String): Seq[String] = + marcTag.substring(1) match { + case "00" => Seq("e", "j") + case "10" => Seq("e") + case "11" => Seq("j") + } + + override def apply(field: MarcField): Seq[ContributionRole] = + field.subfields + .filter( + subfield => roleSubfieldCodes(field.marcTag).contains(subfield.tag) + ) + .map(_.content) + // The contribution role in the raw MARC data sometimes includes a + // trailing full stop, because all the subfields are meant to be concatenated + // into a single sentence. + // + // This full stop doesn't make sense in a structured field, so remove it. + .map(_.stripSuffix(".")) + .map(ContributionRole) +} diff --git a/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcContributors.scala b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcContributors.scala new file mode 100644 index 0000000000..2b0c9cb9a5 --- /dev/null +++ b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcContributors.scala @@ -0,0 +1,102 @@ +package weco.pipeline.transformer.marc_common.transformers + +import grizzled.slf4j.Logging +import weco.catalogue.internal_model.identifiers.IdState +import weco.catalogue.internal_model.work.Contributor + +import weco.pipeline.transformer.marc_common.logging.LoggingContext +import weco.pipeline.transformer.marc_common.models.{MarcField, MarcRecord} + +import scala.util.{Failure, Success} + +/* Populate wwork:contributors. Rules: + * + * A contributor may be primary (1xx) or not (7xx) + * A contributor may be a person (x00), organisation (x10) or meeting (x11) + * + * For Persons and Organisations, subfield $e is used for the labels in "roles". + * + * Note: for MARC tag 700, we want to type as "Agent" rather than "Person" + * if there's a subfield "t", as this may indicate something more specific. + * e.g. some MARC records have "Hamlet", the fictional character as a 700 entry. + * We'll add a more specific type later, but "Person" isn't appropriate. + * + * Order by MARC tag (100, 110, 700, 710), then by order of appearance + * in the MARC data. + * + * https://www.loc.gov/marc/bibliographic/bd100.html + * https://www.loc.gov/marc/bibliographic/bd110.html + * https://www.loc.gov/marc/bibliographic/bd700.html + * https://www.loc.gov/marc/bibliographic/bd710.html + * + */ + +object MarcContributors + extends MarcDataTransformerWithLoggingContext + with Logging { + import weco.pipeline.transformer.marc_common.OntologyTypeOps._ + + type Output = Seq[Contributor[IdState.Unminted]] + + def apply(record: MarcRecord)(implicit ctx: LoggingContext): Output = { + val primaries = record + .fieldsWithTags("100", "110", "111") + val secondaries = record.fieldsWithTags("700", "710", "711") + filterDuplicates( + (primaries ++ secondaries) + .flatMap(field => singleContributor(field)) + ).toList.harmoniseOntologyTypes + } + private def isPrimary(marcTag: String): Boolean = + marcTag.startsWith("1") + + /** Remove non-primary contributors who are also present as primary + * contributors + * + * It is possible that the input MARC may have some contributors who are + * mentioned as both primary (1xx - Main Entry...) and non-primary (7xx - + * Added Entry...) contributors. + * + * We do not want this duplication in the output. + */ + private def filterDuplicates(allContributors: Output): Output = { + val duplicatedContributors = + allContributors + .filter(_.primary == false) + .filter(c => allContributors.contains(c.copy(primary = true))) + .toSet + + allContributors + .filterNot(c => duplicatedContributors.contains(c)) + } + private def singleContributor( + field: MarcField + )(implicit ctx: LoggingContext): Option[Contributor[IdState.Unminted]] = { + (field.marcTag.substring(1) match { + // Some "Person" entries cannot be reliably determined to be an actual + // for-real-life Person, so we make them Agents + // This is a weird Sierra-specific hack and I don't think it + // should exist, so I'm not going to put in all the effort to + // extract this specific little bit of behaviour into a Sierra-specific + // transformer + case "00" if field.subfields.exists(_.tag == "t") => MarcAgent(field) + case "00" => MarcPerson(field) + case "10" => MarcOrganisation(field) + case "11" => MarcMeeting(field) + }) match { + case Failure(exception) => + // Log and ignore. A broken Agent contributor is not + // worth throwing out the whole record + error(ctx(exception.getMessage)) + None + case Success(agent) => + Some( + Contributor( + agent = agent, + roles = MarcContributionRoles(field).toList, + primary = isPrimary(field.marcTag) + ) + ) + } + } +} diff --git a/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcDataTransformer.scala b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcDataTransformer.scala index 76f47e1aa7..d356672516 100644 --- a/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcDataTransformer.scala +++ b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcDataTransformer.scala @@ -1,8 +1,14 @@ package weco.pipeline.transformer.marc_common.transformers import weco.pipeline.transformer.marc_common.logging.LoggingContext +import weco.pipeline.transformer.marc_common.models.{MarcField, MarcRecord} import weco.pipeline.transformer.marc_common.models.MarcRecord +/* + * A MarcDataTransformer finds the appropriate field(s) within a + * MarcRecord, and transforms them into the target output. + * + * */ trait MarcDataTransformer { type Output @@ -16,3 +22,21 @@ trait MarcDataTransformerWithLoggingContext { implicit ctx: LoggingContext ): Output } + +/* + * A MarcFieldTransformer transforms a given MarcField into the target + * output. + * + * This allows for fields that that may have subtly different final outputs + * (as governed by a MarcDataTransformer) depending on which field it is. + * + * For example, an Organisation or a Person may be a subject or a contributor. + * The FieldTransformer can generate a Person, regardless of which it is + * while the corresponding DataTransformer sets the Subjectness or Contributorness + * of it. + * */ +trait MarcFieldTransformer { + type Output + + def apply(field: MarcField): Output +} diff --git a/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcMeeting.scala b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcMeeting.scala new file mode 100644 index 0000000000..0e139d6574 --- /dev/null +++ b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcMeeting.scala @@ -0,0 +1,24 @@ +package weco.pipeline.transformer.marc_common.transformers + +import weco.catalogue.internal_model.identifiers.IdState +import weco.catalogue.internal_model.work.{AbstractAgent, Meeting} + +object MarcMeeting extends MarcFieldTransformer with MarcAbstractAgent { + + override protected val ontologyType: String = "Meeting" + override protected val appropriateFields: Seq[String] = + Seq("111", "611", "711") + override protected val labelSubfieldTags: Seq[String] = Seq( + "a", + "c", + "d", + "t" + ) + + override protected def createAgent( + label: String, + identifier: IdState.Unminted + ): AbstractAgent[IdState.Unminted] = + new Meeting(label = normaliseLabel(label), id = identifier) + +} diff --git a/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcOrganisation.scala b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcOrganisation.scala new file mode 100644 index 0000000000..0c00417f65 --- /dev/null +++ b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcOrganisation.scala @@ -0,0 +1,28 @@ +package weco.pipeline.transformer.marc_common.transformers + +import weco.catalogue.internal_model.identifiers.IdState +import weco.catalogue.internal_model.work.{AbstractAgent, Organisation} + +object MarcOrganisation extends MarcFieldTransformer with MarcAbstractAgent { + + override protected val ontologyType: String = "Organisation" + override protected val appropriateFields: Seq[String] = + Seq("110", "610", "710") + override protected val labelSubfieldTags: Seq[String] = Seq( + "a", + "b", + "c", + "d", + "t", + "p", + "q", + "l" + ) + + override protected def createAgent( + label: String, + identifier: IdState.Unminted + ): AbstractAgent[IdState.Unminted] = + new Organisation(label = normaliseLabel(label), id = identifier) + +} diff --git a/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcPerson.scala b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcPerson.scala new file mode 100644 index 0000000000..14df0b59ea --- /dev/null +++ b/pipeline/transformer/transformer_marc_common/src/main/scala/weco/pipeline/transformer/marc_common/transformers/MarcPerson.scala @@ -0,0 +1,42 @@ +package weco.pipeline.transformer.marc_common.transformers + +import weco.catalogue.internal_model.identifiers.IdState +import weco.catalogue.internal_model.work.{AbstractAgent, Person} +import weco.pipeline.transformer.marc_common.models.MarcField + +object MarcPerson extends MarcFieldTransformer with MarcAbstractAgent { + + override protected val ontologyType: String = "Person" + override protected val appropriateFields: Seq[String] = + Seq("100", "600", "700") + override protected val labelSubfieldTags: Seq[String] = Seq( + "a", + "b", + "c", + "d", + "t", + "n", + "p", + "q", + "l" + ) + override protected def getLabel(field: MarcField): Option[String] = { + super.getLabel(field) match { + case None => None + case Some(label) => + field.marcTag match { + // Person labels should not be normalised for subjects + case "600" => Some(label) + // Person labels should be normalised for contributors + case _ => Some(normaliseLabel(normaliseLabel(label))) + } + } + } + + override protected def createAgent( + label: String, + identifier: IdState.Unminted + ): AbstractAgent[IdState.Unminted] = + new Person(label = label, id = identifier) + +} diff --git a/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcAbstractAgentBehaviours.scala b/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcAbstractAgentBehaviours.scala new file mode 100644 index 0000000000..56642ea275 --- /dev/null +++ b/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcAbstractAgentBehaviours.scala @@ -0,0 +1,107 @@ +package weco.pipeline.transformer.marc_common.transformers + +import org.scalatest.{LoneElement, TryValues} +import org.scalatest.funspec.AnyFunSpec +import org.scalatest.matchers.should.Matchers +import weco.catalogue.internal_model.identifiers.IdentifierType +import weco.pipeline.transformer.marc_common.models.{MarcField, MarcSubfield} + +trait MarcAbstractAgentBehaviours + extends AnyFunSpec + with Matchers + with TryValues + with LoneElement { + def abstractAgentTransformer( + transformer: MarcAbstractAgent, + ontologyType: String, + marcTag: String + ): Unit = { + + describe("error conditions") { + it("returns an error if called with an inappropriate field") { + transformer( + MarcField( + marcTag = "999", + subfields = Seq(MarcSubfield(tag = "a", content = "Anne A. Gent")) + ) + ).failure.exception.getMessage should startWith( + "attempt to transform incompatible MARC field" + ) + } + + it("returns an error if no label subfields are present") { + transformer( + MarcField( + marcTag = marcTag, + subfields = Nil + ) + ).failure.exception.getMessage should startWith( + "no label found when transforming" + ) + } + + it("returns an error if label subfields are empty") { + transformer( + MarcField( + marcTag = marcTag, + subfields = Seq(MarcSubfield(tag = "a", content = "")) + ) + ).failure.exception.getMessage should startWith( + "no label found when transforming" + ) + } + } + + describe("extracting a simple label") { + it("extracts the ǂa subfield as the label") { + transformer( + MarcField( + marcTag = marcTag, + subfields = Seq(MarcSubfield(tag = "a", content = "Anne A. Gent")) + ) + ).get should have( + 'label("Anne A. Gent") + ) + } + } + + describe("assigning an identifier") { + it("extracts ǂ0 as the id") { + transformer( + MarcField( + marcTag = marcTag, + subfields = Seq( + MarcSubfield(tag = "a", content = "Anne A. Gent"), + MarcSubfield(tag = "0", content = "no97058352") + ) + ) + ).get.id.allSourceIdentifiers.loneElement should have( + 'value("no97058352"), + 'ontologyType(ontologyType), + // Agents with identifiers are assumed to be + // identified under LCNames + // This is not strictly true - + // - 1xx and 7xx have no official defined value for this + // - in 6xx this is governed by ind2=0 + 'identifierType(IdentifierType.LCNames) + ) + } + it("generates a label-derived id if there is no ǂ0") { + transformer( + MarcField( + marcTag = marcTag, + subfields = Seq( + MarcSubfield(tag = "a", content = "Anne On a Moose") + ) + ) + ).get.id.allSourceIdentifiers.loneElement should have( + 'value("anne on a moose"), + 'ontologyType(ontologyType), + // Agents with identifiers are assumed to be identified under LCNames + 'identifierType(IdentifierType.LabelDerived) + ) + } + } + } + +} diff --git a/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcContributorsTest.scala b/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcContributorsTest.scala new file mode 100644 index 0000000000..78ad912cc6 --- /dev/null +++ b/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcContributorsTest.scala @@ -0,0 +1,478 @@ +package weco.pipeline.transformer.marc_common.transformers + +import org.scalatest.{Inspectors, LoneElement} +import org.scalatest.funspec.AnyFunSpec +import org.scalatest.matchers.should.Matchers +import org.scalatest.prop.TableDrivenPropertyChecks +import weco.catalogue.internal_model.work.{Agent, Person} +import weco.pipeline.transformer.marc_common.generators.MarcTestRecord +import weco.pipeline.transformer.marc_common.logging.LoggingContext +import weco.pipeline.transformer.marc_common.models.{MarcField, MarcSubfield} + +class MarcContributorsTest + extends AnyFunSpec + with Matchers + with Inspectors + with TableDrivenPropertyChecks + with LoneElement { + + private implicit val ctx: LoggingContext = LoggingContext("") + describe( + "extracting contributors from Main Entry (1xx) and Added Entry (7xx) fields" + ) { + info("https://www.loc.gov/marc/bibliographic/bd1xx.html") + info("https://www.loc.gov/marc/bibliographic/bd70x75x.html") + describe("When there are no contributors") { + it("returns nothing if no relevant fields are present") { + MarcContributors( + MarcTestRecord(fields = + Seq( + MarcField( + marcTag = "245", + subfields = Seq(MarcSubfield(tag = "a", content = "The Title")) + ) + ) + ) + ) shouldBe Nil + } + + it("returns nothing if the only relevant fields are invalid") { + MarcContributors( + MarcTestRecord(fields = + Seq( + MarcField( + marcTag = "110", + subfields = Nil + ), + MarcField( + marcTag = "100", + subfields = Seq( + MarcSubfield(tag = "a", content = " "), + MarcSubfield(tag = "b", content = ""), + MarcSubfield(tag = "c", content = " ") + ) + ) + ) + ) + ) shouldBe Nil + } + + } + + describe("When there are contributors to transform") { + it("returns a mix of primary and non-primary contributors") { + val contributors = MarcContributors( + MarcTestRecord(fields = + Seq( + MarcField( + marcTag = "100", + subfields = Seq(MarcSubfield(tag = "a", content = "Euripedes")) + ), + MarcField( + marcTag = "110", + subfields = Seq( + MarcSubfield(tag = "a", content = "Felpersham University") + ) + ), + MarcField( + marcTag = "111", + subfields = + Seq(MarcSubfield(tag = "a", content = "Council of Elrond")) + ), + MarcField( + marcTag = "700", + subfields = + Seq(MarcSubfield(tag = "a", content = "Anil Mendem")) + ), + MarcField( + marcTag = "710", + subfields = Seq( + MarcSubfield(tag = "a", content = "University of Inverdoon") + ) + ), + MarcField( + marcTag = "711", + subfields = Seq( + MarcSubfield( + tag = "a", + content = "Xenophon's Symposium" + ) + ) + ) + ) + ) + ) + contributors.map(_.primary) should contain theSameElementsAs Seq( + true, true, true, false, false, false + ) + contributors.map(_.agent.label) should contain theSameElementsAs Seq( + "Euripedes", + "Felpersham University", + "Council of Elrond", + "Anil Mendem", + "University of Inverdoon", + "Xenophon's Symposium" + ) + } + it( + "returns primary contributors first, but otherwise respects document order" + ) { + val contributors = MarcContributors( + MarcTestRecord(fields = + Seq( + MarcField( + marcTag = "711", + subfields = Seq( + MarcSubfield( + tag = "a", + content = "Xenophon's Symposium" + ) + ) + ), + MarcField( + marcTag = "700", + subfields = + Seq(MarcSubfield(tag = "a", content = "Anil Mendem")) + ), + MarcField( + marcTag = "100", + subfields = Seq(MarcSubfield(tag = "a", content = "Euripedes")) + ), + MarcField( + marcTag = "710", + subfields = Seq( + MarcSubfield(tag = "a", content = "University of Inverdoon") + ) + ), + MarcField( + marcTag = "110", + subfields = Seq( + MarcSubfield(tag = "a", content = "Felpersham University") + ) + ), + MarcField( + marcTag = "111", + subfields = + Seq(MarcSubfield(tag = "a", content = "Council of Elrond")) + ) + ) + ) + ) + contributors.map(_.primary) should contain theSameElementsAs Seq( + true, true, true, false, false, false + ) + contributors.map(_.agent.label) should contain theSameElementsAs Seq( + "Euripedes", + "Felpersham University", + "Council of Elrond", + "Xenophon's Symposium", + "Anil Mendem", + "University of Inverdoon" + ) + } + + it( + "returns an Agent contributor if a Person field contains subfield t: 'Title of a work'" + ) { + info( + "This is a historic quirk, it may not need to be this way, and it's probably incorrect" + ) + val contributors = MarcContributors( + MarcTestRecord(fields = + Seq( + MarcField( + marcTag = "700", + subfields = Seq( + MarcSubfield(tag = "a", content = "Sæmundr fróði"), + MarcSubfield(tag = "t", content = "Codex Regius") + ) + ), + MarcField( + marcTag = "100", + subfields = Seq( + MarcSubfield(tag = "a", content = "Snorri Sturluson"), + MarcSubfield(tag = "t", content = "Gylfaginning") + ) + ) + ) + ) + ) + contributors.head.agent should be(an[Agent[_]]) + contributors(1).agent should be(an[Agent[_]]) + } + + it("harmonises Agent entries with Person entries if they match") { + info("The first two fields generate Agents due to the t subfield") + info( + "However, because fields with the same ID later generate a Person" + ) + info("the ontologytype harmonisation converts them both to a Person") + val contributors = MarcContributors( + MarcTestRecord(fields = + Seq( + MarcField( + marcTag = "100", + subfields = Seq( + MarcSubfield(tag = "a", content = "Sæmundr fróði"), + MarcSubfield(tag = "t", content = "Codex Regius"), + MarcSubfield(tag = "0", content = "n97018003") + ) + ), + MarcField( + marcTag = "100", + subfields = Seq( + MarcSubfield(tag = "a", content = "Snorri Sturluson"), + MarcSubfield(tag = "t", content = "Gylfaginning"), + MarcSubfield(tag = "0", content = "n50000553") + ) + ), + MarcField( + marcTag = "700", + subfields = Seq( + MarcSubfield(tag = "a", content = "Sæmundr fróði"), + MarcSubfield(tag = "0", content = "n97018003") + ) + ), + MarcField( + marcTag = "700", + subfields = Seq( + MarcSubfield(tag = "a", content = "Snorri Sturluson"), + MarcSubfield(tag = "0", content = "n50000553") + ) + ) + ) + ) + ) + forAll(contributors) { + contributor => + contributor.agent should be(a[Person[_]]) + } + } + describe("different agent types") { + forAll( + Table( + ("tag", "agentType"), + ("100", "Person"), + ("700", "Person"), + ("110", "Organisation"), + ("710", "Organisation"), + ("111", "Meeting"), + ("711", "Meeting") + ) + ) { + (tag, agent_type) => + it(s"extracts an agent of type $agent_type from field $tag") { + MarcContributors( + MarcTestRecord(fields = + Seq( + MarcField( + marcTag = tag, + subfields = Seq( + MarcSubfield(tag = "a", content = "Sæmundr fróði"), + MarcSubfield(tag = "e", content = "E"), + MarcSubfield(tag = "j", content = "J") + ) + ) + ) + ) + ).loneElement.agent.getClass.getName + .split('.') + .last shouldBe agent_type + } + + } + } + describe("contribution roles") { + forAll( + Table( + ("tag", "role_subfields", "expected_roles"), + ("100", "e,j", "E J"), + ("700", "e,j", "E J"), + ("110", "e", "E"), + ("710", "e", "E"), + ("111", "j", "J"), + ("711", "j", "J") + ) + ) { + (tag, role_subfields, expected_roles) => + it( + s"extracts the contribution role from field $tag from subfield(s) $role_subfields" + ) { + MarcContributors( + MarcTestRecord(fields = + Seq( + MarcField( + marcTag = tag, + subfields = Seq( + MarcSubfield(tag = "a", content = "Sæmundr fróði"), + MarcSubfield(tag = "e", content = "E"), + MarcSubfield(tag = "j", content = "J") + ) + ) + ) + ) + ).loneElement.roles + .map(_.label) + .mkString(" ") shouldBe expected_roles + } + } + } + describe("deduplication") { + + it( + "returns only the primary contributor if that contributor is duplicated as an Added Entry" + ) { + val contributors = MarcContributors( + MarcTestRecord(fields = + Seq( + MarcField( + marcTag = "100", + subfields = Seq( + MarcSubfield(tag = "a", content = "Sæmundr fróði"), + MarcSubfield(tag = "0", content = "n97018003") + ) + ), + MarcField( + marcTag = "100", + subfields = Seq( + MarcSubfield(tag = "a", content = "Snorri Sturluson"), + MarcSubfield(tag = "0", content = "n50000553") + ) + ), + MarcField( + marcTag = "700", + subfields = Seq( + MarcSubfield(tag = "a", content = "Sæmundr fróði"), + MarcSubfield(tag = "0", content = "n97018003") + ) + ), + MarcField( + marcTag = "700", + subfields = Seq( + MarcSubfield(tag = "a", content = "Snorri Sturluson"), + MarcSubfield(tag = "0", content = "n50000553") + ) + ) + ) + ) + ) + contributors should have length 2 + contributors.map(_.agent.label) should contain theSameElementsAs Seq( + "Sæmundr fróði", + "Snorri Sturluson" + ) + forAll(contributors) { + contributor => contributor.primary shouldBe true + } + } + + it( + "deduplicates by matching on the whole contributor" + ) { + info("It is only the difference in primary status that is ignored") + info("when finding and eliminating duplicate contributors") + + val fields = Seq( + MarcField( + marcTag = "100", + subfields = Seq( + MarcSubfield(tag = "a", content = "Max Bialystock"), + MarcSubfield(tag = "e", content = "producer") + ) + ), + MarcField( + // differs by presence of id + marcTag = "700", + subfields = Seq( + MarcSubfield(tag = "a", content = "Max Bialystock"), + MarcSubfield(tag = "0", content = "no2021024802"), + MarcSubfield(tag = "e", content = "producer") + ) + ), + MarcField( + // is the same + marcTag = "700", + subfields = Seq( + MarcSubfield(tag = "a", content = "Max Bialystock"), + MarcSubfield(tag = "e", content = "producer") + ) + ), + MarcField( + // differs by having a different role + marcTag = "700", + subfields = Seq( + MarcSubfield(tag = "a", content = "Max Bialystock"), + MarcSubfield(tag = "e", content = "director") + ) + ), + MarcField( + // differs by having no role + marcTag = "700", + subfields = Seq( + MarcSubfield(tag = "a", content = "Max Bialystock") + ) + ), + MarcField( + // is the same + marcTag = "700", + subfields = Seq( + MarcSubfield(tag = "a", content = "Max Bialystock"), + MarcSubfield(tag = "e", content = "producer") + ) + ) + ) + + val contributors = MarcContributors(MarcTestRecord(fields = fields)) + contributors should have length 4 + // not really an assertion, just to clarify that the two + // 700s that are otherwise identical to the 100 are discarded + fields should have length 6 + } + + it( + "Also removes fully identical duplicates" + ) { + val Seq(primary, secondary) = MarcContributors( + MarcTestRecord(fields = + Seq( + MarcField( + marcTag = "100", + subfields = Seq( + MarcSubfield(tag = "a", content = "Sæmundr fróði") + ) + ), + MarcField( + marcTag = "100", + subfields = Seq( + MarcSubfield(tag = "a", content = "Sæmundr fróði") + ) + ), + MarcField( + marcTag = "700", + subfields = Seq( + MarcSubfield(tag = "a", content = "Snorri Sturluson") + ) + ), + MarcField( + marcTag = "700", + subfields = Seq( + MarcSubfield(tag = "a", content = "Snorri Sturluson") + ) + ) + ) + ) + ) + + primary.primary shouldBe true + primary.agent.label shouldBe "Sæmundr fróði" + + secondary.primary shouldBe false + secondary.agent.label shouldBe "Snorri Sturluson" + + } + } + } + + } + +} diff --git a/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcMeetingTest.scala b/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcMeetingTest.scala new file mode 100644 index 0000000000..f320473f9d --- /dev/null +++ b/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcMeetingTest.scala @@ -0,0 +1,90 @@ +package weco.pipeline.transformer.marc_common.transformers + +import org.scalatest.LoneElement +import org.scalatest.prop.TableDrivenPropertyChecks +import weco.catalogue.internal_model.work.Meeting +import weco.pipeline.transformer.marc_common.models.{MarcField, MarcSubfield} + +class MarcMeetingTest + extends MarcAbstractAgentBehaviours + with LoneElement + with TableDrivenPropertyChecks { + describe("extracting Meetings from MARC fields 111, 611 and 711") { + info("https://www.loc.gov/marc/bibliographic/bdx11.html") + info("https://www.loc.gov/marc/bibliographic/bd111.html") + info("https://www.loc.gov/marc/bibliographic/bd611.html") + info("https://www.loc.gov/marc/bibliographic/bd711.html") + + it should behave like abstractAgentTransformer( + MarcMeeting, + "Meeting", + "111" + ) + + describe("permitted marcTags") { + forAll( + Table("marcTag", "111", "611", "711") + ) { + marcTag => + it(s"creates a Meeting given a field with tag $marcTag") { + MarcMeeting( + MarcField( + marcTag = marcTag, + subfields = Seq( + MarcSubfield( + tag = "a", + content = "Entmoot" + ), + MarcSubfield( + tag = "c", + content = "Derndingle" + ), + MarcSubfield( + tag = "d", + content = "TA 3019" + ) + ) + ) + ).get shouldBe an[Meeting[_]] + } + } + } + + val labelSubfieldTags = Seq( + "a", + "c", + "d", + "t" + ) + it( + s"derives the label from subFields ${labelSubfieldTags.mkString(", ")}" + ) { + + val expectedLabel = labelSubfieldTags.mkString(" ").toUpperCase() + val labelSubfields = labelSubfieldTags map { + tag => + MarcSubfield( + tag = tag, + content = tag.toUpperCase() + ) + } + + val nonLabelSubfields = Seq("n", "p", "q") map { + tag => + MarcSubfield( + tag = tag, + content = tag.toUpperCase() + ) + } + + MarcMeeting( + MarcField( + marcTag = "711", + subfields = labelSubfields ++ nonLabelSubfields + ) + ).get.label shouldBe expectedLabel + } + + } + +} diff --git a/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcOrganisationTest.scala b/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcOrganisationTest.scala new file mode 100644 index 0000000000..200dc40605 --- /dev/null +++ b/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcOrganisationTest.scala @@ -0,0 +1,85 @@ +package weco.pipeline.transformer.marc_common.transformers + +import org.scalatest.LoneElement +import org.scalatest.prop.TableDrivenPropertyChecks +import weco.catalogue.internal_model.work.Organisation +import weco.pipeline.transformer.marc_common.models.{MarcField, MarcSubfield} + +class MarcOrganisationTest + extends MarcAbstractAgentBehaviours + with LoneElement + with TableDrivenPropertyChecks { + describe("extracting Organisations from MARC fields 110, 610 and 710") { + info("https://www.loc.gov/marc/bibliographic/bdx10.html") + info("https://www.loc.gov/marc/bibliographic/bd110.html") + info("https://www.loc.gov/marc/bibliographic/bd610.html") + info("https://www.loc.gov/marc/bibliographic/bd710.html") + + it should behave like abstractAgentTransformer( + MarcOrganisation, + "Organisation", + "110" + ) + describe("permitted marcTags") { + + forAll( + Table("marcTag", "110", "610", "710") + ) { + marcTag => + it(s"creates an Organisation given a field with tag $marcTag") { + MarcOrganisation( + MarcField( + marcTag = marcTag, + subfields = Seq( + MarcSubfield( + tag = "a", + content = "The Illuminati" + ) + ) + ) + ).get shouldBe an[Organisation[_]] + } + } + } + val labelSubfieldTags = Seq( + "a", + "b", + "c", + "d", + "t", + "p", + "q", + "l" + ) + it( + s"derives the label from subFields ${labelSubfieldTags.mkString(", ")}" + ) { + + val expectedLabel = labelSubfieldTags.mkString(" ").toUpperCase() + val labelSubfields = labelSubfieldTags map { + tag => + MarcSubfield( + tag = tag, + content = tag.toUpperCase() + ) + } + + val nonLabelSubfields = Seq("n", "0", "u") map { + tag => + MarcSubfield( + tag = tag, + content = tag.toUpperCase() + ) + } + + MarcOrganisation( + MarcField( + marcTag = "710", + subfields = labelSubfields ++ nonLabelSubfields + ) + ).get.label shouldBe expectedLabel + } + + } + +} diff --git a/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcPersonTest.scala b/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcPersonTest.scala new file mode 100644 index 0000000000..1c76516a97 --- /dev/null +++ b/pipeline/transformer/transformer_marc_common/src/test/scala/weco/pipeline/transformer/marc_common/transformers/MarcPersonTest.scala @@ -0,0 +1,85 @@ +package weco.pipeline.transformer.marc_common.transformers + +import org.scalatest.LoneElement +import org.scalatest.prop.TableDrivenPropertyChecks +import weco.catalogue.internal_model.work.Person +import weco.pipeline.transformer.marc_common.models.{MarcField, MarcSubfield} + +class MarcPersonTest + extends MarcAbstractAgentBehaviours + with LoneElement + with TableDrivenPropertyChecks { + describe("extracting Persons from MARC fields 100, 600 and 700") { + info("https://www.loc.gov/marc/bibliographic/bdx00.html") + info("https://www.loc.gov/marc/bibliographic/bd100.html") + info("https://www.loc.gov/marc/bibliographic/bd600.html") + info("https://www.loc.gov/marc/bibliographic/bd700.html") + + it should behave like abstractAgentTransformer( + MarcPerson, + "Person", + "100" + ) + describe("permitted marcTags") { + forAll( + Table("marcTag", "100", "600", "700") + ) { + marcTag => + it(s"creates a Person given a field with tag $marcTag") { + MarcPerson( + MarcField( + marcTag = marcTag, + subfields = Seq( + MarcSubfield( + tag = "a", + content = "Ekalavya" + ) + ) + ) + ).get shouldBe a[Person[_]] + } + } + } + val labelSubfieldTags = Seq( + "a", + "b", + "c", + "d", + "t", + "n", + "p", + "q", + "l" + ) + it( + s"derives the label from subFields ${labelSubfieldTags.mkString(", ")}" + ) { + + val expectedLabel = labelSubfieldTags.mkString(" ").toUpperCase() + val labelSubfields = labelSubfieldTags map { + tag => + MarcSubfield( + tag = tag, + content = tag.toUpperCase() + ) + } + + val nonLabelSubfields = Seq("m", "0", "u") map { + tag => + MarcSubfield( + tag = tag, + content = tag.toUpperCase() + ) + } + + MarcPerson( + MarcField( + marcTag = "700", + subfields = labelSubfields ++ nonLabelSubfields + ) + ).get.label shouldBe expectedLabel + } + + } + +} diff --git a/pipeline/transformer/transformer_marc_xml/src/main/scala/weco/pipeline/transformer/marc/xml/transformers/MarcXMLRecordTransformer.scala b/pipeline/transformer/transformer_marc_xml/src/main/scala/weco/pipeline/transformer/marc/xml/transformers/MarcXMLRecordTransformer.scala index 213f3a6af2..8ea80c8bf6 100644 --- a/pipeline/transformer/transformer_marc_xml/src/main/scala/weco/pipeline/transformer/marc/xml/transformers/MarcXMLRecordTransformer.scala +++ b/pipeline/transformer/transformer_marc_xml/src/main/scala/weco/pipeline/transformer/marc/xml/transformers/MarcXMLRecordTransformer.scala @@ -11,6 +11,7 @@ import weco.pipeline.transformer.marc.xml.data.MarcXMLRecord import weco.pipeline.transformer.marc_common.logging.LoggingContext import weco.pipeline.transformer.marc_common.transformers.{ MarcAlternativeTitles, + MarcContributors, MarcCurrentFrequency, MarcDescription, MarcDesignation, @@ -59,7 +60,8 @@ object MarcXMLRecordTransformer { description = MarcDescription(record), currentFrequency = MarcCurrentFrequency(record), edition = MarcEdition(record), - items = MarcElectronicResources(record).toList + items = MarcElectronicResources(record).toList, + contributors = MarcContributors(record).toList ) } } diff --git a/pipeline/transformer/transformer_marc_xml/src/test/scala/weco/pipeline/transformer/marc/xml/transformers/MarcXMLRecordTransformerTest.scala b/pipeline/transformer/transformer_marc_xml/src/test/scala/weco/pipeline/transformer/marc/xml/transformers/MarcXMLRecordTransformerTest.scala index 3e85bb7079..29db5dad4c 100644 --- a/pipeline/transformer/transformer_marc_xml/src/test/scala/weco/pipeline/transformer/marc/xml/transformers/MarcXMLRecordTransformerTest.scala +++ b/pipeline/transformer/transformer_marc_xml/src/test/scala/weco/pipeline/transformer/marc/xml/transformers/MarcXMLRecordTransformerTest.scala @@ -46,6 +46,16 @@ class MarcXMLRecordTransformerTest 1477-4615 + + Nicholas Fallaize + + + SMERSH + + + Aristotle and Descartes, + Glubbdubdrib + LLyfr Coch @@ -67,6 +77,16 @@ class MarcXMLRecordTransformerTest Some of them [sc. physicians] I know are ignorant beyond Description. + + Nora Helmer + + + SPECTRE + + + James Barry and Florence Nightingale, + waiting for a train + Hampster Dance https://example.com/hampsterdance @@ -88,7 +108,7 @@ class MarcXMLRecordTransformerTest _.value ) should contain theSameElementsAs Seq("8601416781396", "1477-4615") } - + it("extracts the current frequency") { work.data.currentFrequency.get shouldBe "Sizdah Behar on even-numbered years" } @@ -112,5 +132,19 @@ class MarcXMLRecordTransformerTest .asInstanceOf[DigitalLocation] .url shouldBe "https://example.com/hampsterdance" } + + it("extracts contributors") { + work.data.contributors.map( + _.agent.label + ) should contain theSameElementsAs Seq( + "Nicholas Fallaize", + "SMERSH", + "Aristotle and Descartes, Glubbdubdrib", + "Nora Helmer", + "SPECTRE", + "James Barry and Florence Nightingale, waiting for a train" + ) + } + } } diff --git a/pipeline/transformer/transformer_sierra/src/main/scala/weco/pipeline/transformer/sierra/SubjectsAndContributors.scala b/pipeline/transformer/transformer_sierra/src/main/scala/weco/pipeline/transformer/sierra/SubjectsAndContributors.scala index 4c4df0fefd..dc620153e2 100644 --- a/pipeline/transformer/transformer_sierra/src/main/scala/weco/pipeline/transformer/sierra/SubjectsAndContributors.scala +++ b/pipeline/transformer/transformer_sierra/src/main/scala/weco/pipeline/transformer/sierra/SubjectsAndContributors.scala @@ -7,7 +7,7 @@ import weco.pipeline.transformer.sierra.transformers.{ SierraContributors, SierraSubjects } -import weco.pipeline.transformer.sierra.transformers.OntologyTypeOps._ +import weco.pipeline.transformer.marc_common.OntologyTypeOps._ import weco.sierra.models.data.SierraBibData import weco.sierra.models.identifiers.SierraBibNumber diff --git a/pipeline/transformer/transformer_sierra/src/main/scala/weco/pipeline/transformer/sierra/transformers/SierraContributors.scala b/pipeline/transformer/transformer_sierra/src/main/scala/weco/pipeline/transformer/sierra/transformers/SierraContributors.scala index 0d714e86dd..81ffe9b297 100644 --- a/pipeline/transformer/transformer_sierra/src/main/scala/weco/pipeline/transformer/sierra/transformers/SierraContributors.scala +++ b/pipeline/transformer/transformer_sierra/src/main/scala/weco/pipeline/transformer/sierra/transformers/SierraContributors.scala @@ -1,15 +1,13 @@ package weco.pipeline.transformer.sierra.transformers -import weco.catalogue.internal_model.identifiers.{ - IdState, - IdentifierType, - SourceIdentifier -} +import weco.catalogue.internal_model.identifiers.IdState import weco.catalogue.internal_model.work._ import weco.pipeline.transformer.identifiers.LabelDerivedIdentifiers +import weco.pipeline.transformer.marc_common.logging.LoggingContext +import weco.pipeline.transformer.marc_common.transformers.MarcContributors +import weco.pipeline.transformer.sierra.data.SierraMarcDataConversions import weco.sierra.models.SierraQueryOps import weco.sierra.models.data.SierraBibData -import weco.sierra.models.marc.Subfield /* Populate wwork:contributors. Rules: * @@ -37,200 +35,13 @@ object SierraContributors extends SierraDataTransformer with SierraQueryOps with SierraAgents - with LabelDerivedIdentifiers { + with LabelDerivedIdentifiers + with SierraMarcDataConversions { type Output = List[Contributor[IdState.Unminted]] - import OntologyTypeOps._ - - case class ContributorField( - marcTag: String, - roleTags: Seq[String], - isPrimary: Boolean, - getContributors: List[Subfield] => ( - String, - Option[AbstractAgent[IdState.Unminted]] - ) - ) - - // Quoting an email from Louise Grainger, dated 25 August 2022: - // - // 100 – we want valid subfield $e (relator/contributor), but NOT valid $j (attribution qualifier) - // – if no $e do not display alternative - // - // 110 – we want valid subfield $e (relator/contributor); $j is invalid in this field - // – if no $e do not display alternative - // - // 111 – we want valid subfield $j (relator/contributor); $e is invalid in this field - // – if no $j do not display alternative - // - // 700 – we want valid subfield $e (relator/contributor) AND valid subfield $j (attribution qualifier) - // – if no $e then display $j; if $e then (ideally) display $j too - // - // 710 - we want valid subfield $e (relator/contributor); $j is invalid in this field - // – if no $e do not display alternative - // - // 711 - we want valid subfield $j (relator/contributor); $e is invalid in this field - // – if no $j do not display alternative - // - val contributorFields = List( - ContributorField( - marcTag = "100", - roleTags = Seq("e"), - isPrimary = true, - getPersonContributors - ), - ContributorField( - marcTag = "110", - roleTags = Seq("e"), - isPrimary = true, - getOrganisationContributors - ), - ContributorField( - marcTag = "111", - roleTags = Seq("j"), - isPrimary = true, - getMeetingContributors - ), - ContributorField( - marcTag = "700", - roleTags = Seq("e", "j"), - isPrimary = false, - getPersonContributors - ), - ContributorField( - marcTag = "710", - roleTags = Seq("e"), - isPrimary = false, - getOrganisationContributors - ), - ContributorField( - marcTag = "711", - roleTags = Seq("j"), - isPrimary = false, - getMeetingContributors - ) - ) - def apply(bibData: SierraBibData): List[Contributor[IdState.Unminted]] = { - val allContributors = contributorFields.flatMap { - case ContributorField(marcTag, roleTags, isPrimary, getContributors) => - bibData - .varfieldsWithTag(marcTag) - .flatMap { - varfield => - val (ontologyType, maybeAgent) = - getContributors(varfield.subfields) - maybeAgent.map { - agent => - Contributor( - agent = withId( - agent, - identify(varfield.subfields, ontologyType, agent.label) - ), - roles = getContributionRoles(varfield.subfields, roleTags), - primary = isPrimary - ) - } - } - } - - // We need to remove duplicates where two contributors differ only - // by primary/not-primary - val duplicatedContributors = - allContributors - .filter(_.primary == false) - .filter(c => allContributors.contains(c.copy(primary = true))) - .toSet - - allContributors - .filterNot(c => duplicatedContributors.contains(c)) - .harmoniseOntologyTypes - } - private def getPersonContributors( - subfields: List[Subfield] - ): (String, Option[AbstractAgent[IdState.Unminted]]) = - if (subfields.withTags("t").isEmpty) - "Person" -> getPerson(subfields, normalisePerson = true) - else - "Agent" -> getLabel(subfields).map(Agent(_)) - - private def getOrganisationContributors(subfields: List[Subfield]) = - "Organisation" -> getOrganisation(subfields) - - private def getMeetingContributors(subfields: List[Subfield]) = - "Meeting" -> getMeeting(subfields) - - private def getContributionRoles( - subfields: List[Subfield], - roleTags: Seq[String] - ): List[ContributionRole] = - subfields - .withTags(roleTags: _*) - .contents - .map { - role => - // The contribution role in the raw MARC data sometimes includes a - // trailing full stop, because all the subfields are meant to be concatenated - // into a single sentence. - // - // This full stop doesn't make sense in a structured field, so remove it. - role.stripSuffix(".") - } - .map(ContributionRole) - - private def withId( - agent: AbstractAgent[IdState.Unminted], - id: IdState.Unminted - ) = - agent match { - case a: Agent[IdState.Unminted] => a.copy(id = id) - case p: Person[IdState.Unminted] => p.copy(id = id) - case o: Organisation[IdState.Unminted] => o.copy(id = id) - case m: Meeting[IdState.Unminted] => m.copy(id = id) - } - - /* Given an agent and the associated MARC subfields, look for instances of subfield $0, - * which are used for identifiers. - * - * This methods them (if present) and wraps the agent in Unidentifiable or Identifiable - * as appropriate. - */ - private def identify( - subfields: List[Subfield], - ontologyType: String, - label: String - ): IdState.Unminted = { - - // We take the contents of subfield $0. They may contain inconsistent - // spacing and punctuation, such as: - // - // " nr 82270463" - // "nr 82270463" - // "nr 82270463.," - // - // which all refer to the same identifier. - // - // For consistency, we remove all whitespace and some punctuation - // before continuing. - val codes = subfields.collect { - case Subfield("0", content) => - content.replaceAll("[.,\\s]", "") - } - - // If we get exactly one value, we can use it to identify the record. - // Some records have multiple instances of subfield $0 (it's a repeatable - // field in the MARC spec). - codes.distinct match { - case Seq(code) => - IdState.Identifiable( - SourceIdentifier( - identifierType = IdentifierType.LCNames, - value = code, - ontologyType = ontologyType - ) - ) - case _ => identifierFromText(label = label, ontologyType = ontologyType) - } + implicit val ctx: LoggingContext = LoggingContext("") + MarcContributors(bibData).toList } } diff --git a/pipeline/transformer/transformer_sierra/src/main/scala/weco/pipeline/transformer/sierra/transformers/SierraSubjects.scala b/pipeline/transformer/transformer_sierra/src/main/scala/weco/pipeline/transformer/sierra/transformers/SierraSubjects.scala index 8b9ecde147..57afca534a 100644 --- a/pipeline/transformer/transformer_sierra/src/main/scala/weco/pipeline/transformer/sierra/transformers/SierraSubjects.scala +++ b/pipeline/transformer/transformer_sierra/src/main/scala/weco/pipeline/transformer/sierra/transformers/SierraSubjects.scala @@ -14,7 +14,7 @@ import weco.sierra.models.identifiers.SierraBibNumber object SierraSubjects extends SierraIdentifiedDataTransformer { type Output = List[Subject[IdState.Unminted]] - import OntologyTypeOps._ + import weco.pipeline.transformer.marc_common.OntologyTypeOps._ val subjectsTransformers = List( SierraConceptSubjects, diff --git a/pipeline/transformer/transformer_sierra/src/test/scala/weco/pipeline/transformer/sierra/transformers/SierraContributorsTest.scala b/pipeline/transformer/transformer_sierra/src/test/scala/weco/pipeline/transformer/sierra/transformers/SierraContributorsTest.scala index 5c9d970182..709979bde7 100644 --- a/pipeline/transformer/transformer_sierra/src/test/scala/weco/pipeline/transformer/sierra/transformers/SierraContributorsTest.scala +++ b/pipeline/transformer/transformer_sierra/src/test/scala/weco/pipeline/transformer/sierra/transformers/SierraContributorsTest.scala @@ -16,7 +16,7 @@ import weco.pipeline.transformer.sierra.transformers.matchers.{ import weco.sierra.generators.SierraDataGenerators import weco.sierra.models.marc.{Subfield, VarField} import org.scalatest.prop.TableDrivenPropertyChecks -import org.scalatest.Inspectors +import org.scalatest.{Inspectors, LoneElement} import weco.catalogue.internal_model.identifiers.IdState.Identifiable class SierraContributorsTest @@ -27,8 +27,9 @@ class SierraContributorsTest with HasIdMatchers with SierraDataGenerators with TableDrivenPropertyChecks - with Inspectors { - import OntologyTypeOps._ + with Inspectors + with LoneElement { + import weco.pipeline.transformer.marc_common.OntologyTypeOps._ it("gets an empty contributor list from empty bib data") { SierraContributors(createSierraBibDataWith(varFields = Nil)) shouldBe Nil @@ -121,7 +122,7 @@ class SierraContributorsTest contributor.agent.asInstanceOf[AbstractAgent[IdState.Unminted]] ) contributor.agent should have( - 'label (label), + 'label(label), sourceIdentifier( ontologyType = ontologyType, identifierType = IdentifierType.LabelDerived, @@ -155,7 +156,7 @@ class SierraContributorsTest ) val contributors = SierraContributors(createSierraBibDataWith(varFields = varFields)) - all(contributors) should have('roles (Nil)) + all(contributors) should have('roles(Nil)) all(contributors.map(_.agent)) shouldBe a[Person[_]] val List(c1, c2) = contributors c1.agent.label shouldBe "Charles Emmanuel III, King of Sardinia, 1701-1773" @@ -184,7 +185,7 @@ class SierraContributorsTest val List(contributor) = SierraContributors(bibData) contributor.agent should have( - 'label ("Shakespeare, William, 1564-1616. Hamlet.") + 'label("Shakespeare, William, 1564-1616. Hamlet.") ) contributor.agent shouldBe an[Agent[IdState.Identifiable]] } @@ -217,7 +218,7 @@ class SierraContributorsTest val contributors = SierraContributors(createSierraBibDataWith(varFields = varFields)) - all(contributors) should have('roles (Nil)) + all(contributors) should have('roles(Nil)) all(contributors.map(_.agent)) shouldBe a[Person[_]] contributors.map(_.agent.label) shouldBe List(name1, name2, name3) all(contributors.tail) shouldNot be(primary) @@ -243,7 +244,7 @@ class SierraContributorsTest val List(contributor) = SierraContributors(createSierraBibDataWith(varFields = varFields)) contributor.agent shouldBe a[Person[_]] - contributor.agent should have('label (name)) + contributor.agent should have('label(name)) contributor shouldBe primary contributor should have( roles(List(role1, role2)) @@ -325,7 +326,7 @@ class SierraContributorsTest val List(contributor) = SierraContributors(createSierraBibDataWith(varFields = varFields)) contributor.agent should have( - 'label (name), + 'label(name), sourceIdentifier( identifierType = IdentifierType.LCNames, ontologyType = "Person", @@ -362,7 +363,7 @@ class SierraContributorsTest val List(contributor) = SierraContributors(createSierraBibDataWith(varFields = varFields)) contributor.agent should have( - 'label (name), + 'label(name), sourceIdentifier( identifierType = IdentifierType.LCNames, ontologyType = "Person", @@ -393,7 +394,7 @@ class SierraContributorsTest contributor.agent shouldBe a[Person[_]] contributor.agent should have( - 'label (name), + 'label(name), labelDerivedPersonId(name.toLowerCase) ) } @@ -415,7 +416,7 @@ class SierraContributorsTest val contributors = SierraContributors(createSierraBibDataWith(varFields = varFields)) contributors.size shouldBe 2 - all(contributors) should have('roles (Nil)) + all(contributors) should have('roles(Nil)) all(contributors.map(_.agent)) shouldBe a[Person[_]] contributors.map(_.agent.label) shouldBe List("George", "Sebastian") contributors.head shouldBe primary @@ -438,7 +439,7 @@ class SierraContributorsTest SierraContributors(createSierraBibDataWith(varFields = varFields)) contributor.agent shouldBe an[Organisation[_]] contributor.agent should have( - 'label (name) + 'label(name) ) } @@ -469,7 +470,7 @@ class SierraContributorsTest SierraContributors(createSierraBibDataWith(varFields = varFields)) contributor.agent shouldBe an[Organisation[_]] contributor.agent should have( - 'label ( + 'label( "IARC Working Group on the Evaluation of the Carcinogenic Risk of Chemicals to Man. Meeting 1972 : Lyon, France" ) ) @@ -502,7 +503,7 @@ class SierraContributorsTest val contributors = SierraContributors(createSierraBibDataWith(varFields = varFields)) - all(contributors) should have('roles (Nil)) + all(contributors) should have('roles(Nil)) all(contributors.map(_.agent)) shouldBe a[Organisation[_]] contributors.map(_.agent.label) shouldBe List(name1, name2, name3) contributors.head shouldBe primary @@ -528,7 +529,7 @@ class SierraContributorsTest val List(contributor) = SierraContributors(createSierraBibDataWith(varFields = varFields)) contributor.agent shouldBe an[Organisation[_]] - contributor.agent should have('label (name)) + contributor.agent should have('label(name)) contributor should have( roles(List(role1, role2)) ) @@ -551,7 +552,7 @@ class SierraContributorsTest val List(contributor) = SierraContributors(createSierraBibDataWith(varFields = varFields)) contributor.agent should have( - 'label (name), + 'label(name), sourceIdentifier( identifierType = IdentifierType.LCNames, ontologyType = "Organisation", @@ -584,7 +585,7 @@ class SierraContributorsTest val List(contributor) = SierraContributors(createSierraBibDataWith(varFields = varFields)) contributor.agent should have( - 'label (name), + 'label(name), sourceIdentifier( identifierType = IdentifierType.LCNames, ontologyType = "Organisation", @@ -614,7 +615,7 @@ class SierraContributorsTest contributor.agent shouldBe an[Organisation[_]] contributor.agent should have( - 'label (name), + 'label(name), labelDerivedOrganisationId(name.toLowerCase) ) } @@ -635,7 +636,7 @@ class SierraContributorsTest val contributors = SierraContributors(createSierraBibDataWith(varFields = varFields)) contributors should have size 2 - all(contributors) should have('roles (Nil)) + all(contributors) should have('roles(Nil)) all(contributors.map(_.agent)) shouldBe a[Organisation[_]] contributors.map(_.agent.label) shouldBe List( "The organisation", @@ -658,7 +659,9 @@ class SierraContributorsTest ) ) ) - SierraContributors(createSierraBibDataWith(varFields = varFields)) shouldBe Nil + SierraContributors( + createSierraBibDataWith(varFields = varFields) + ) shouldBe Nil } describe("Meeting") { @@ -669,7 +672,7 @@ class SierraContributorsTest ) val List(contributor) = SierraContributors(createSierraBibDataWith(varFields = List(varField))) - contributor.agent should have('label ("Big meeting")) + contributor.agent should have('label("Big meeting")) contributor.agent shouldBe a[Meeting[_]] } @@ -680,7 +683,7 @@ class SierraContributorsTest ) val List(contributor) = SierraContributors(createSierraBibDataWith(varFields = List(varField))) - contributor.agent should have('label ("Big meeting")) + contributor.agent should have('label("Big meeting")) contributor.agent shouldBe a[Meeting[_]] contributor shouldNot be(primary) @@ -699,7 +702,7 @@ class SierraContributorsTest ) val List(contributor) = SierraContributors(createSierraBibDataWith(varFields = List(varField))) - contributor.agent should have('label ("1 2 3 4")) + contributor.agent should have('label("1 2 3 4")) contributor shouldBe primary } @@ -715,7 +718,7 @@ class SierraContributorsTest ) val List(contributor) = SierraContributors(createSierraBibDataWith(varFields = List(varField))) - contributor.agent should have('label ("label")) + contributor.agent should have('label("label")) contributor should have( roles(List("1st role", "2nd role")) ) @@ -732,7 +735,7 @@ class SierraContributorsTest val List(contributor) = SierraContributors(createSierraBibDataWith(varFields = List(varField))) contributor.agent should have( - 'label ("label"), + 'label("label"), sourceIdentifier( identifierType = IdentifierType.LCNames, ontologyType = "Meeting", @@ -766,14 +769,15 @@ class SierraContributorsTest ) val bibData = createSierraBibDataWith(varFields = varFields) - val List(contributor) = SierraContributors(bibData) + val contributor = SierraContributors(bibData).loneElement + contributor should have( - 'roles (Nil) + 'roles(Nil) ) contributor.agent shouldBe a[Person[_]] contributor shouldBe primary contributor.agent should have( - 'label ("Steele, Richard, Sir, 1672-1729.") + 'label("Steele, Richard, Sir, 1672-1729.") ) } @@ -798,7 +802,7 @@ class SierraContributorsTest val List(contributor) = SierraContributors(bibData) contributor shouldNot be(primary) contributor.agent should have( - 'label ("Brewer, George. Essays after the manner of Goldsmith, No. 1-22.") + 'label("Brewer, George. Essays after the manner of Goldsmith, No. 1-22.") ) } @@ -816,7 +820,10 @@ class SierraContributorsTest ), Subfield(tag = "l", content = "Latin."), Subfield(tag = "f", content = "1561."), - Subfield(tag = "0", content = "n 79005643") // This identifier is for Hippocrates only + Subfield( + tag = "0", + content = "n 79005643" + ) // This identifier is for Hippocrates only ) ) ) @@ -929,9 +936,10 @@ class SierraContributorsTest // Both subjects should be represented in the output list. They have different labels, so are different objects. contributors.length shouldBe 2 val commonClass = contributors(1).agent.getClass - forAll(contributors) { contributor => - contributor.agent.getClass shouldBe commonClass - contributor.agent.id.allSourceIdentifiers.head.ontologyType shouldBe specificType + forAll(contributors) { + contributor => + contributor.agent.getClass shouldBe commonClass + contributor.agent.id.allSourceIdentifiers.head.ontologyType shouldBe specificType } } } @@ -979,7 +987,11 @@ class SierraContributorsTest roles = Nil ) val contributors = - List(vagueContributor, specificContributor1, vagueContributor2).harmoniseOntologyTypes + List( + vagueContributor, + specificContributor1, + vagueContributor2 + ).harmoniseOntologyTypes // They all had the same label before, so they are no longer unique having harmonised their types contributors.length shouldBe 1 @@ -1030,16 +1042,21 @@ class SierraContributorsTest roles = Nil ) val contributors = - List(specificContributor1, specificContributor2, vagueContributor).harmoniseOntologyTypes + List( + specificContributor1, + specificContributor2, + vagueContributor + ).harmoniseOntologyTypes contributors.length shouldBe 3 - forAll(contributors) { contributor => - contributor.agent - .asInstanceOf[Person[IdState.Identifiable]] - .id - .sourceIdentifier - .ontologyType shouldBe "Person" + forAll(contributors) { + contributor => + contributor.agent + .asInstanceOf[Person[IdState.Identifiable]] + .id + .sourceIdentifier + .ontologyType shouldBe "Person" } } } diff --git a/pipeline/transformer/transformer_sierra/src/test/scala/weco/pipeline/transformer/sierra/transformers/SierraSubjectsTest.scala b/pipeline/transformer/transformer_sierra/src/test/scala/weco/pipeline/transformer/sierra/transformers/SierraSubjectsTest.scala index cfd3cd43b5..7df945e7d4 100644 --- a/pipeline/transformer/transformer_sierra/src/test/scala/weco/pipeline/transformer/sierra/transformers/SierraSubjectsTest.scala +++ b/pipeline/transformer/transformer_sierra/src/test/scala/weco/pipeline/transformer/sierra/transformers/SierraSubjectsTest.scala @@ -25,7 +25,7 @@ class SierraSubjectsTest with SierraDataGenerators with TableDrivenPropertyChecks with Inspectors { - import OntologyTypeOps._ + import weco.pipeline.transformer.marc_common.OntologyTypeOps._ it("deduplicates identical subjects") { // This is based on b2506728x. The different second indicators @@ -51,13 +51,13 @@ class SierraSubjectsTest ) val List(subject) = SierraSubjects(createSierraBibNumber, bibData) subject should have( - 'label ("Medicine"), + 'label("Medicine"), labelDerivedConceptId("medicine") ) val List(concept) = subject.concepts concept should have( - 'label ("Medicine"), + 'label("Medicine"), labelDerivedConceptId("medicine") ) } @@ -77,7 +77,7 @@ class SierraSubjectsTest ) val List(subject) = SierraSubjects(createSierraBibNumber, bibData) subject should have( - 'label ("Medicine"), + 'label("Medicine"), sourceIdentifier( value = "sh85083064", ontologyType = "Concept", @@ -87,7 +87,7 @@ class SierraSubjectsTest val List(concept) = subject.concepts concept should have( - 'label ("Medicine"), + 'label("Medicine"), sourceIdentifier( value = "sh85083064", ontologyType = "Concept", @@ -114,39 +114,41 @@ class SierraSubjectsTest // itself "vague" ("Concept", "Agent") ) - ) { (vagueType, specificType) => - val vagueSubject = new Subject( - label = "Maimonides, in his work on Logic", - id = Identifiable( - SourceIdentifier( - IdentifierType.LCSubjects, - vagueType, - "sh00000000" + ) { + (vagueType, specificType) => + val vagueSubject = new Subject( + label = "Maimonides, in his work on Logic", + id = Identifiable( + SourceIdentifier( + IdentifierType.LCSubjects, + vagueType, + "sh00000000" + ) ) ) - ) - val specificSubject = new Subject( - label = "Maimonides", - id = Identifiable( - SourceIdentifier( - IdentifierType.LCSubjects, - specificType, - "sh00000000" + val specificSubject = new Subject( + label = "Maimonides", + id = Identifiable( + SourceIdentifier( + IdentifierType.LCSubjects, + specificType, + "sh00000000" + ) ) ) - ) - val subjects = - List(specificSubject, vagueSubject).harmoniseOntologyTypes - // Both subjects should be represented in the output list. They have different labels, so are different objects. - subjects.length shouldBe 2 + val subjects = + List(specificSubject, vagueSubject).harmoniseOntologyTypes + // Both subjects should be represented in the output list. They have different labels, so are different objects. + subjects.length shouldBe 2 - forAll(subjects) { subject => - subject - .asInstanceOf[Subject[IdState.Identifiable]] - .id - .sourceIdentifier - .ontologyType shouldBe specificType - } + forAll(subjects) { + subject => + subject + .asInstanceOf[Subject[IdState.Identifiable]] + .id + .sourceIdentifier + .ontologyType shouldBe specificType + } } } @@ -182,7 +184,11 @@ class SierraSubjectsTest ) ) val subjects = - List(specificSubject1, specificSubject2, vagueSubject).harmoniseOntologyTypes + List( + specificSubject1, + specificSubject2, + vagueSubject + ).harmoniseOntologyTypes // They all had the same label before, so they are no longer unique having harmoniseOntologyTypesd their types subjects.length shouldBe 1 @@ -225,16 +231,21 @@ class SierraSubjectsTest ) ) val subjects = - List(specificSubject1, specificSubject2, vagueSubject).harmoniseOntologyTypes + List( + specificSubject1, + specificSubject2, + vagueSubject + ).harmoniseOntologyTypes subjects.length shouldBe 3 - forAll(subjects) { subject => - subject - .asInstanceOf[Subject[IdState.Identifiable]] - .id - .sourceIdentifier - .ontologyType shouldBe "Person" + forAll(subjects) { + subject => + subject + .asInstanceOf[Subject[IdState.Identifiable]] + .id + .sourceIdentifier + .ontologyType shouldBe "Person" } } @@ -269,12 +280,13 @@ class SierraSubjectsTest val subjects = List(specificSubject, vagueSubject).harmoniseOntologyTypes subjects.length shouldBe 2 - forAll(subjects) { subject => - subject.concepts.head - .asInstanceOf[Person[IdState.Identifiable]] - .id - .sourceIdentifier - .ontologyType shouldBe "Person" + forAll(subjects) { + subject => + subject.concepts.head + .asInstanceOf[Person[IdState.Identifiable]] + .id + .sourceIdentifier + .ontologyType shouldBe "Person" } } @@ -331,20 +343,22 @@ class SierraSubjectsTest subjects.length shouldBe 2 // The vague subject should be changed to be Person - forAll(subjects) { subject => - subject - .asInstanceOf[Subject[IdState.Identifiable]] - .id - .sourceIdentifier - .ontologyType shouldBe "Person" + forAll(subjects) { + subject => + subject + .asInstanceOf[Subject[IdState.Identifiable]] + .id + .sourceIdentifier + .ontologyType shouldBe "Person" } // But the concepts list is left unchanged. - forAll(subjects.head.concepts) { concept => - concept - .asInstanceOf[Concept[IdState.Identifiable]] - .id - .sourceIdentifier - .ontologyType shouldBe "Concept" + forAll(subjects.head.concepts) { + concept => + concept + .asInstanceOf[Concept[IdState.Identifiable]] + .id + .sourceIdentifier + .ontologyType shouldBe "Concept" } }