Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Marc Contributors #2605

Merged
merged 27 commits into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
702081b
extract ISBN/ISSN to common
paul-butcher Mar 13, 2024
c143626
extract 020 and 022 from XML
paul-butcher Mar 13, 2024
57c0572
add Edition to common marc
paul-butcher Mar 13, 2024
6af9e88
Implement common MarcElectronicResources
paul-butcher Mar 14, 2024
e5c66c9
Apply auto-formatting rules
weco-bot Mar 14, 2024
b3d6c5f
move loggingContext around
paul-butcher Mar 15, 2024
6b39d04
sort out the ElectronicResources interface
paul-butcher Mar 15, 2024
f675d30
Merge branch 'main' into marc-electronic-resources
paul-butcher Mar 15, 2024
7c0c0ba
remove redundant TODO
paul-butcher Mar 15, 2024
b05c5d8
resources now populate a default linkeText
paul-butcher Mar 18, 2024
91278de
Update pipeline/transformer/transformer_marc_common/src/test/scala/we…
paul-butcher Mar 18, 2024
ffd6a4e
fix merge errors
paul-butcher Mar 18, 2024
bad238c
warn on empty labels
paul-butcher Mar 18, 2024
63b86e2
implement common MARC Designation
paul-butcher Mar 19, 2024
e7e616d
add logging context to Designation
paul-butcher Mar 19, 2024
802ba95
actually commit the files this time
paul-butcher Mar 19, 2024
b4a4d41
Marc description (#2595)
paul-butcher Mar 20, 2024
cb0cdda
move contributor transformation to marc common
paul-butcher Apr 4, 2024
5aec1e8
Apply auto-formatting rules
weco-bot Apr 4, 2024
718b95a
Merge branch 'main' into marc-agents
paul-butcher Apr 4, 2024
c633f47
merge
paul-butcher Apr 4, 2024
b2d471e
merge
paul-butcher Apr 4, 2024
da8ed41
add contributors to XML transformer
paul-butcher Apr 4, 2024
b46b5b4
add source-independent contributor tests
paul-butcher Apr 5, 2024
5939755
add tests for agent types and contribution roles
paul-butcher Apr 5, 2024
9f2580f
fix merge error
paul-butcher Apr 5, 2024
a6df260
Remove dead imports
paul-butcher Apr 5, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package weco.pipeline.transformer.sierra.transformers
package weco.pipeline.transformer.marc_common

import grizzled.slf4j.Logging
import weco.catalogue.internal_model.identifiers.{IdState, SourceIdentifier}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
package weco.pipeline.transformer.marc_common.transformers

import weco.catalogue.internal_model.identifiers.{
IdState,
IdentifierType,
SourceIdentifier
}
import weco.catalogue.internal_model.work.AbstractAgent
import weco.pipeline.transformer.identifiers.LabelDerivedIdentifiers
import weco.pipeline.transformer.marc_common.models.{MarcField, MarcSubfield}
import weco.pipeline.transformer.text.TextNormalisation.TextNormalisationOps

import scala.util.{Failure, Success, Try}

/** Shared behaviour for turning a single MARC field into an agent.
  *
  * Implementations declare which MARC tags they accept, which subfields make
  * up the label, and how to construct the concrete agent from a label and an
  * identifier.
  */
trait MarcAbstractAgent extends LabelDerivedIdentifiers {
  type Output = Try[AbstractAgent[IdState.Unminted]]

  /** Ontology type recorded on identifiers and named in failure messages. */
  protected val ontologyType: String

  /** MARC tags this transformer accepts; any other tag yields a Failure. */
  protected val appropriateFields: Seq[String]

  /** Tags of the subfields whose contents are joined to make the label. */
  protected val labelSubfieldTags: Seq[String]

  /** Build the concrete agent from its label and identifier. */
  protected def createAgent(
    label: String,
    identifier: IdState.Unminted
  ): AbstractAgent[IdState.Unminted]

  /** Construct a label from the subfields representing an agent.
    *
    * The contents of every subfield whose tag is in `labelSubfieldTags` and
    * whose content is not blank are joined with single spaces, in the order
    * in which they appear in the field.
    *
    * @return
    *   the label, or None if no matching subfield has any content
    */
  protected def getLabel(field: MarcField): Option[String] = {
    val contents = field.subfields.collect {
      case subfield
          if labelSubfieldTags.contains(subfield.tag) &&
            !subfield.content.trim.isEmpty =>
        subfield.content
    }

    if (contents.isEmpty) None else Some(contents.mkString(" "))
  }

  /** Remove trailing commas from a label. */
  protected def normaliseLabel(label: String): String = label.trimTrailing(',')

  /** Work out the identifier for an agent from its MARC subfields.
    *
    * Subfield $0 is used for identifiers. If the field's $0 subfields yield
    * exactly one distinct, non-empty code, the agent is Identifiable by that
    * code (as an LCNames identifier). Otherwise (no $0, only blank $0s, or
    * several different codes) the identifier is derived from the label.
    *
    * @param subfields
    *   all the subfields of the MARC field being transformed
    * @param label
    *   the agent's label, used when no single code can be found
    */
  // TODO: Consider if this might be a trait in its own right.
  //  It also applies to things that are not agents.
  protected def getIdentifier(
    subfields: Seq[MarcSubfield],
    label: String
  ): IdState.Unminted = {

    // We take the contents of subfield $0. They may contain inconsistent
    // spacing and punctuation, such as:
    //
    //    " nr 82270463"
    //    "nr 82270463"
    //    "nr 82270463.,"
    //
    // which all refer to the same identifier.
    //
    // For consistency, we remove all whitespace and some punctuation
    // before continuing.  A $0 that contained nothing else is discarded,
    // so that it can never produce an identifier with an empty value.
    val codes = subfields
      .collect {
        case MarcSubfield("0", content) =>
          content.replaceAll("[.,\\s]", "")
      }
      .filter(_.nonEmpty)

    // If we get exactly one value, we can use it to identify the record.
    // Some records have multiple instances of subfield $0 (it's a repeatable
    // field in the MARC spec).
    codes.distinct match {
      case Seq(code) =>
        IdState.Identifiable(
          SourceIdentifier(
            identifierType = IdentifierType.LCNames,
            value = code,
            ontologyType = ontologyType
          )
        )
      case _ => identifierFromText(label = label, ontologyType = ontologyType)
    }
  }

  private def isAppropriateField(field: MarcField): Boolean =
    appropriateFields.contains(field.marcTag)

  /** Transform a MARC field into an agent.
    *
    * @return
    *   Success wrapping the agent; or a Failure if the field's tag is not one
    *   of `appropriateFields`, or if no label can be built from the field.
    */
  def apply(field: MarcField): Output =
    if (!isAppropriateField(field)) {
      Failure(
        new Exception(
          s"attempt to transform incompatible MARC field ${field.marcTag} into $ontologyType"
        )
      )
    } else {
      getLabel(field) match {
        case Some(label) =>
          Success(
            createAgent(
              label,
              getIdentifier(subfields = field.subfields, label = label)
            )
          )
        case None =>
          Failure(
            new Exception(
              s"no label found when transforming $field into an $ontologyType"
            )
          )
      }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package weco.pipeline.transformer.marc_common.transformers

import weco.catalogue.internal_model.identifiers.IdState
import weco.catalogue.internal_model.work.{AbstractAgent, Agent}

/** Transforms a 100, 600 or 700 field into a generic Agent.
  *
  * The label is used exactly as it was assembled from the subfields;
  * `normaliseLabel` is not applied here.
  */
object MarcAgent extends MarcFieldTransformer with MarcAbstractAgent {

  override protected val ontologyType: String = "Agent"

  // Tags of the MARC fields this transformer will accept.
  override protected val appropriateFields: Seq[String] =
    Seq("100", "600", "700")

  // Subfields whose contents contribute to the label.
  override protected val labelSubfieldTags: Seq[String] =
    Seq("a", "b", "c", "d", "t", "n", "p", "q", "l")

  override protected def createAgent(
    label: String,
    identifier: IdState.Unminted
  ): AbstractAgent[IdState.Unminted] =
    new Agent(id = identifier, label = label)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package weco.pipeline.transformer.marc_common.transformers

import weco.catalogue.internal_model.work.ContributionRole
import weco.pipeline.transformer.marc_common.models.MarcField

/** Extracts the contribution roles recorded on a single MARC field.
  *
  * Which subfields hold roles depends on the last two characters of the
  * field's tag: x00 uses $e and $j, x10 uses $e, and x11 uses $j. A field
  * with any other tag has no role subfields, so it yields no roles.
  */
object MarcContributionRoles extends MarcFieldTransformer {
  type Output = Seq[ContributionRole]

  /** The subfield tags that hold roles for a field with the given MARC tag.
    *
    * Returns Nil for a tag that does not end in 00, 10 or 11 (including a
    * tag that is too short to have such an ending), rather than throwing.
    */
  private def roleSubfieldCodes(marcTag: String): Seq[String] =
    marcTag.drop(1) match {
      case "00" => Seq("e", "j")
      case "10" => Seq("e")
      case "11" => Seq("j")
      case _    => Nil
    }

  /** @return
    *   one ContributionRole per role subfield, in the order the subfields
    *   appear in the field
    */
  override def apply(field: MarcField): Seq[ContributionRole] = {
    // The role subfields depend only on the field's tag, so work them out
    // once rather than once per subfield.
    val roleTags = roleSubfieldCodes(field.marcTag)

    field.subfields
      .filter(subfield => roleTags.contains(subfield.tag))
      // The contribution role in the raw MARC data sometimes includes a
      // trailing full stop, because all the subfields are meant to be
      // concatenated into a single sentence.
      //
      // This full stop doesn't make sense in a structured field, so remove it.
      .map(subfield => ContributionRole(subfield.content.stripSuffix(".")))
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package weco.pipeline.transformer.marc_common.transformers

import grizzled.slf4j.Logging
import weco.catalogue.internal_model.identifiers.IdState
import weco.catalogue.internal_model.work.Contributor

import weco.pipeline.transformer.marc_common.logging.LoggingContext
import weco.pipeline.transformer.marc_common.models.{MarcField, MarcRecord}

import scala.util.{Failure, Success}

/* Populate wwork:contributors. Rules:
*
* A contributor may be primary (1xx) or not (7xx)
* A contributor may be a person (x00), organisation (x10) or meeting (x11)
*
* Subfield $e is used for the labels in "roles" for Persons and Organisations;
* subfield $j is used for Persons and Meetings.
*
* Note: for MARC tag 700, we want to type as "Agent" rather than "Person"
* if there's a subfield "t", as this may indicate something more specific.
* e.g. some MARC records have "Hamlet", the fictional character as a 700 entry.
* We'll add a more specific type later, but "Person" isn't appropriate.
*
* Order by MARC tag (100, 110, 700, 710), then by order of appearance
* in the MARC data.
*
* https://www.loc.gov/marc/bibliographic/bd100.html
* https://www.loc.gov/marc/bibliographic/bd110.html
* https://www.loc.gov/marc/bibliographic/bd700.html
* https://www.loc.gov/marc/bibliographic/bd710.html
*
*/

/** Builds the list of contributors for a MARC record from its main entry
  * (100, 110, 111) and added entry (700, 710, 711) fields.
  *
  * A field that cannot be turned into an agent is logged and skipped, rather
  * than failing the whole record.
  */
object MarcContributors
    extends MarcDataTransformerWithLoggingContext
    with Logging {
  import weco.pipeline.transformer.marc_common.OntologyTypeOps._

  type Output = Seq[Contributor[IdState.Unminted]]

  /** @return
    *   the contributors found on the record, with the fields tagged 1xx
    *   processed ahead of those tagged 7xx, and with any non-primary
    *   contributor dropped if it duplicates a primary one
    */
  def apply(record: MarcRecord)(implicit ctx: LoggingContext): Output = {
    val primaries = record.fieldsWithTags("100", "110", "111")
    val secondaries = record.fieldsWithTags("700", "710", "711")
    filterDuplicates(
      (primaries ++ secondaries)
        .flatMap(field => singleContributor(field))
    ).toList.harmoniseOntologyTypes
  }

  private def isPrimary(marcTag: String): Boolean =
    marcTag.startsWith("1")

  /** Remove non-primary contributors who are also present as primary
    * contributors
    *
    * It is possible that the input MARC may have some contributors who are
    * mentioned as both primary (1xx - Main Entry...) and non-primary (7xx -
    * Added Entry...) contributors.
    *
    * We do not want this duplication in the output.
    */
  private def filterDuplicates(allContributors: Output): Output = {
    // Collect the primaries once, so that each non-primary contributor is
    // checked with a set lookup instead of a scan of the whole list.
    val primaryContributors = allContributors.filter(_.primary).toSet

    allContributors.filterNot {
      contributor =>
        !contributor.primary &&
        primaryContributors.contains(contributor.copy(primary = true))
    }
  }

  /** Turn one MARC field into a contributor.
    *
    * The kind of agent is chosen from the last two characters of the tag.
    * Roles come from MarcContributionRoles, and the contributor is primary
    * when the tag starts with "1".
    *
    * @return
    *   None (after logging an error) if no agent could be made from the field
    */
  private def singleContributor(
    field: MarcField
  )(implicit ctx: LoggingContext): Option[Contributor[IdState.Unminted]] = {
    val agentAttempt = field.marcTag.drop(1) match {
      // Some "Person" entries cannot be reliably determined to be an actual
      // for-real-life Person, so we make them Agents
      // This is a weird Sierra-specific hack and I don't think it
      // should exist, so I'm not going to put in all the effort to
      // extract this specific little bit of behaviour into a Sierra-specific
      // transformer
      case "00" if field.subfields.exists(_.tag == "t") => MarcAgent(field)
      case "00"                                         => MarcPerson(field)
      case "10"                                         => MarcOrganisation(field)
      case "11"                                         => MarcMeeting(field)
      // Not reachable from apply, which only selects x00/x10/x11 tags, but
      // an unexpected tag should be logged and ignored like any other
      // failure, not escape as a MatchError.
      case _ =>
        Failure(
          new Exception(
            s"attempt to transform unsupported MARC field ${field.marcTag} into a contributor"
          )
        )
    }

    agentAttempt match {
      case Failure(exception) =>
        // Log and ignore.  A broken Agent contributor is not
        // worth throwing out the whole record
        error(ctx(exception.getMessage))
        None
      case Success(agent) =>
        Some(
          Contributor(
            agent = agent,
            roles = MarcContributionRoles(field).toList,
            primary = isPrimary(field.marcTag)
          )
        )
    }
  }
}
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
package weco.pipeline.transformer.marc_common.transformers

import weco.pipeline.transformer.marc_common.logging.LoggingContext
import weco.pipeline.transformer.marc_common.models.{MarcField, MarcRecord}
import weco.pipeline.transformer.marc_common.models.MarcRecord

/*
* A MarcDataTransformer finds the appropriate field(s) within a
* MarcRecord, and transforms them into the target output.
*
* */
trait MarcDataTransformer {
type Output

Expand All @@ -16,3 +22,21 @@ trait MarcDataTransformerWithLoggingContext {
implicit ctx: LoggingContext
): Output
}

/*
* A MarcFieldTransformer transforms a given MarcField into the target
* output.
*
* This allows for fields that may have subtly different final outputs
* (as governed by a MarcDataTransformer) depending on which field it is.
*
* For example, an Organisation or a Person may be a subject or a contributor.
* The FieldTransformer can generate a Person, regardless of which it is,
* while the corresponding DataTransformer sets the Subjectness or Contributorness
* of it.
* */
trait MarcFieldTransformer {
// The type this transformer produces; fixed by each implementation.
type Output

// Transform a single MARC field into this transformer's Output.
def apply(field: MarcField): Output
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package weco.pipeline.transformer.marc_common.transformers

import weco.catalogue.internal_model.identifiers.IdState
import weco.catalogue.internal_model.work.{AbstractAgent, Meeting}

/** Transforms a 111, 611 or 711 field into a Meeting.
  *
  * The label is passed through `normaliseLabel` before the Meeting is built.
  */
object MarcMeeting extends MarcFieldTransformer with MarcAbstractAgent {

  override protected val ontologyType: String = "Meeting"

  // Tags of the MARC fields this transformer will accept.
  override protected val appropriateFields: Seq[String] =
    Seq("111", "611", "711")

  // Subfields whose contents contribute to the label.
  override protected val labelSubfieldTags: Seq[String] =
    Seq("a", "c", "d", "t")

  override protected def createAgent(
    label: String,
    identifier: IdState.Unminted
  ): AbstractAgent[IdState.Unminted] = {
    val tidiedLabel = normaliseLabel(label)
    new Meeting(id = identifier, label = tidiedLabel)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package weco.pipeline.transformer.marc_common.transformers

import weco.catalogue.internal_model.identifiers.IdState
import weco.catalogue.internal_model.work.{AbstractAgent, Organisation}

/** Transforms a 110, 610 or 710 field into an Organisation.
  *
  * The label is passed through `normaliseLabel` before the Organisation is
  * built.
  */
object MarcOrganisation extends MarcFieldTransformer with MarcAbstractAgent {

  override protected val ontologyType: String = "Organisation"

  // Tags of the MARC fields this transformer will accept.
  override protected val appropriateFields: Seq[String] =
    Seq("110", "610", "710")

  // Subfields whose contents contribute to the label.
  override protected val labelSubfieldTags: Seq[String] =
    Seq("a", "b", "c", "d", "t", "p", "q", "l")

  override protected def createAgent(
    label: String,
    identifier: IdState.Unminted
  ): AbstractAgent[IdState.Unminted] = {
    val tidiedLabel = normaliseLabel(label)
    new Organisation(id = identifier, label = tidiedLabel)
  }
}
Loading
Loading