fast-krippendorff
fast-krippendorff copied to clipboard
No documentation to run from a pandas dataframe
The pandas DataFrame is the most commonly used tool for loading CSVs. Please add documentation on how to calculate the reliability matrix from a DataFrame.
I have managed to build the DataFrame below, which is the same as the standard reliability_data used in this project. (Ignore the decimals; they are necessary for my use case.) An assertion will verify that the two are the same, if you want to check.
I also replaced null values with "N/A", because I am reading the data from Excel files.
# Read the workbook, asking pandas to also recognise the text "N/A" as a
# missing value. NOTE(review): the returned DataFrame is not assigned here.
pd.read_excel(
    "file.xlsx",
    na_values="N/A",
)
def _mask_missing(lines):
    """Split each space-separated line into tokens, turning "*" and "N/A" into NaN."""
    return [
        [np.nan if token in ("*", "N/A") else token for token in line.split()]
        for line in lines
    ]


# One column per rater; "N/A" stands in for a missing rating.
df = pd.DataFrame(
    {
        "rater1": ["N/A", "N/A", "N/A", "N/A", "N/A", 3, 4, 1, 2, 1, 1, 3, 3, "N/A", 3],
        "rater2": [1, "N/A", 2, 1, 3, 3, 4, 3, "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A"],
        "rater3": ["N/A", "N/A", 2, 1, 3, 4, 4, "N/A", 2, 1, 1, 3, 3, "N/A", 4],
    }
)

# Transpose so that each inner list holds one rater's values, then render
# every rater as a single space-separated string.
data = df.T.values.tolist()
data_tuple = tuple(" ".join(str(cell) for cell in ratings) for ratings in data)

# The same ratings written by hand, with "*" as the missing marker.
reliability_data_str = (
    "* * * * * 3 4 1 2 1 1 3 3 * 3",  # coder A
    "1 * 2 1 3 3 4 3 * * * * * * *",  # coder B
    "* * 2 1 3 4 4 * 2 1 1 3 3 * 4",  # coder C
)

print(reliability_data_str)
print(data_tuple)

# Both representations go through the same missing-value masking.
newlistconvert = _mask_missing(data_tuple)
reliability_data = _mask_missing(reliability_data_str)
My full implementation from a while back. It contains single-rater logic that is now redundant, because that case was fixed after I wrote the code.
def _parse_rating(cell):
    """Convert one DataFrame cell to a float rounded to 4 decimals, or NaN if missing.

    ``None``, ``"*"`` and ``"N/A"`` are treated as missing. Anything else must
    be parseable by ``float`` (a ``ValueError`` is raised otherwise).
    """
    if cell is None:
        return np.nan
    text = str(cell).strip()
    if text in ("*", "N/A"):
        return np.nan
    return round(float(text), 4)


def get_krippendorff_DF(inDataFrame):
    """Compute ordinal Krippendorff's alpha for a DataFrame with one column per coder.

    Args:
        inDataFrame: DataFrame whose columns are coders and whose cells are
            numeric ratings; ``"*"``, ``"N/A"`` and ``None`` mark missing
            ratings.

    Returns:
        ``1`` when the frame has exactly one coder column, otherwise the value
        returned by ``krippendorff.alpha`` with ``level_of_measurement="ordinal"``.

    Raises:
        ValueError: if a non-missing cell cannot be converted to ``float``.
    """
    # Transposing gives one inner list per coder.
    rows = inDataFrame.T.values.tolist()
    if len(rows) == 1:
        return 1  # if only one coder, return 1

    # Cells are converted directly. The previous str-join / split round trip
    # dropped empty cells and split cells containing whitespace, which could
    # shift one coder's ratings relative to the others.
    #
    # NOTE(review): the original also carried a reciprocal rescaling for values
    # below 1 (-(1/x) + 2), but it was guarded by an isinstance check on tokens
    # that were always strings, so it never ran. It is left out here to keep
    # results unchanged; re-introduce it explicitly if that scale is wanted.
    ratings = [[_parse_rating(cell) for cell in coder] for coder in rows]

    # NaN must not be part of the domain: comparisons with NaN are always
    # False, so sorting a list that contains it can leave the values out of
    # order, and the ordinal metric depends on that order.
    value_domain = sorted(
        {value for coder in ratings for value in coder if not np.isnan(value)}
    )

    return krippendorff.alpha(
        reliability_data=ratings,
        level_of_measurement="ordinal",
        value_domain=value_domain,
    )
If you add the code below at line 310 of krippendorff.krippendorff.py, the library will process a DataFrame natively.
# Accept a pandas DataFrame with one column per coder. The type is matched by
# name, so this snippet itself needs no pandas import.
if type(reliability_data).__name__ == "DataFrame":
    # Transposing gives one inner list per coder. Cells are converted directly
    # rather than through a str-join / split round trip, which dropped empty
    # cells and split cells containing whitespace.
    # "*", "N/A" and None become NaN; everything else is parsed as a float and
    # rounded to 4 decimals.
    # NOTE(review): the original reciprocal rescaling for values below 1 was
    # guarded by an isinstance check on string tokens and therefore never ran;
    # it is omitted so that results stay the same.
    reliability_data = [
        [
            np.nan
            if cell is None or str(cell).strip() in ("*", "N/A")
            else round(float(str(cell).strip()), 4)
            for cell in coder
        ]
        for coder in reliability_data.T.values.tolist()
    ]
    # `list(...).sort()` returns None, so the original always reset
    # value_domain to None. Build the sorted domain with `sorted`, leave NaN
    # out (it breaks sort order), and keep a domain the caller already passed.
    # Assumes `value_domain` is a parameter of the enclosing function - confirm.
    if value_domain is None:
        value_domain = sorted(
            {value for coder in reliability_data for value in coder if not np.isnan(value)}
        )
it should look like this
if (reliability_data is None) == (value_counts is None):
    raise ValueError("Either reliability_data or value_counts must be provided, but not both.")

# Accept a pandas DataFrame with one column per coder. The type is matched by
# name, so this code itself needs no pandas import.
if type(reliability_data).__name__ == "DataFrame":
    # Transposing gives one inner list per coder. Cells are converted directly
    # rather than through a str-join / split round trip, which dropped empty
    # cells and split cells containing whitespace.
    # "*", "N/A" and None become NaN; everything else is parsed as a float and
    # rounded to 4 decimals.
    # NOTE(review): the original reciprocal rescaling for values below 1 was
    # guarded by an isinstance check on string tokens and therefore never ran;
    # it is omitted so that results stay the same.
    reliability_data = [
        [
            np.nan
            if cell is None or str(cell).strip() in ("*", "N/A")
            else round(float(str(cell).strip()), 4)
            for cell in coder
        ]
        for coder in reliability_data.T.values.tolist()
    ]
    # `list(...).sort()` returns None, so the original always reset
    # value_domain to None. Build the sorted domain with `sorted`, leave NaN
    # out (it breaks sort order), and keep a domain the caller already passed.
    # Assumes `value_domain` is a parameter of the enclosing function - confirm.
    if value_domain is None:
        value_domain = sorted(
            {value for coder in reliability_data for value in coder if not np.isnan(value)}
        )

# Don't know if it's a `list` or NumPy array. If it's the latter, the truth value is ambiguous. So, ask for `None`.
if value_counts is None:
    reliability_data = np.asarray(reliability_data)
Expected input, where df is what should be passed as reliability_data
import pandas as pd
# Each coder's ratings written as one space-separated line; splitting the
# line yields the list of string cells for that coder's column.
_ratings_by_coder = (
    ("coder A", "* * * * * 3 4 1 2 1 1 3 3 * 3"),
    ("coder B", "1 * 2 1 3 3 4 3 * * * * * * *"),
    ("coder C", "* * 2 1 3 4 4 * 2 1 1 3 3 * 4"),
)
data = {coder: line.split() for coder, line in _ratings_by_coder}

# One column per coder, in the order listed above.
df = pd.DataFrame(data)