zingg icon indicating copy to clipboard operation
zingg copied to clipboard

label between two distinct datasets?

Open havardox opened this issue 6 months ago • 2 comments

I have two datasets: a "corpus" and a "query" database. I need to do active labeling only between those two datasets as the values themselves are already distinct for each dataset. Is that possible? Here's my current code:

import os
import sys

from zingg.client import *
from zingg.pipes import *

# Configure and run a Zingg job over two JDBC inputs (`query` and `corpus`)
# with a single JDBC output (`books_unified`). The phase to execute is taken
# from the first command line argument.

# Fail with a readable message instead of a bare IndexError when the phase
# argument is missing.
if len(sys.argv) < 2:
    raise SystemExit(f"usage: {sys.argv[0]} <phase>")

# Phase name to be passed as a command line argument
phase_name = sys.argv[1]

# All pipes point at the same PostgreSQL database, so the URL is built once.
# NOTE: os.getenv returns None for unset variables, in which case the text
# "None" ends up in the URL; make sure DATABASE_HOST / DATABASE_PORT are set.
JDBC_URL = (
    f"jdbc:postgresql://{os.getenv('DATABASE_HOST')}:{os.getenv('DATABASE_PORT')}"
    "/book_linker_test"
)


def _jdbc_pipe(name, table):
    """Return a JDBC ``Pipe`` called *name* that is bound to *table*.

    The connection URL, driver and credentials are the same for every pipe
    in this script; only the pipe name and the table differ.
    """
    pipe = Pipe(name, "jdbc")
    pipe.addProperty("url", JDBC_URL)
    pipe.addProperty("dbtable", table)
    pipe.addProperty("driver", "org.postgresql.Driver")
    pipe.addProperty("user", os.getenv("DATABASE_USER"))
    pipe.addProperty("password", os.getenv("DATABASE_PASSWORD"))
    return pipe


# Set up arguments for Zingg
args = Arguments()

# Define fields that correspond to the SQL table columns.
# The two id columns are declared with MatchType.DONT_USE; the remaining
# columns carry the match types used for comparison.
query_id = FieldDefinition("query_id", "string", MatchType.DONT_USE)
corpus_id = FieldDefinition("corpus_id", "string", MatchType.DONT_USE)
title = FieldDefinition("title", "string", MatchType.FUZZY)
year_published = FieldDefinition(
    "year_published",
    "string",
    MatchType.NUMERIC,
    MatchType.EXACT,
    MatchType.NULL_OR_BLANK,
)
authors = FieldDefinition("authors", "string", MatchType.FUZZY, MatchType.NULL_OR_BLANK)
part_number = FieldDefinition("part_number", "string", MatchType.FUZZY, MatchType.NULL_OR_BLANK)
isbn = FieldDefinition("isbn", "string", MatchType.FUZZY, MatchType.NULL_OR_BLANK)

# Group fields into a list and register them on the arguments
fieldDefs = [query_id, corpus_id, title, year_published, authors, part_number, isbn]
args.setFieldDefinition(fieldDefs)

# Input pipes: one per source table
queryData = _jdbc_pipe("queryData", "query")
corpusData = _jdbc_pipe("corpusData", "corpus")
args.setData(queryData, corpusData)

# Output pipe
booksIdentitiesResolved = _jdbc_pipe("booksIdentitiesResolved", "books_unified")
args.setOutput(booksIdentitiesResolved)

# Model and execution settings
args.setModelId("books_model")
args.setZinggDir("test_models")
args.setNumPartitions(4)
args.setLabelDataSampleSize(0.5)

# Zingg execution options
options = ClientOptions([ClientOptions.PHASE, phase_name])

# Execute Zingg with the provided phase
zingg = Zingg(args, options)
zingg.initAndExecute()

Running `zingg.sh {zingg.conf} --run {python_file} label` only selects samples from the "corpus", because the corpus has about 100k records and the query dataset only about 9k. That's not what I want: I only care about the differences between the query and corpus databases.

havardox avatar Aug 17 '24 16:08 havardox