snp-dists icon indicating copy to clipboard operation
snp-dists copied to clipboard

Reduce molten output to unique pairs

Open amilesj opened this issue 3 years ago • 2 comments

Making a new request for the enhancement suggested in a previous comment (https://github.com/tseemann/snp-dists/issues/39#issuecomment-654909438) to make molten output only unique pairs of isolates.

amilesj avatar Sep 10 '21 14:09 amilesj

Hi @amilesj I have the same issue. Were you able to find a way around getting only unique pair combinations in the molten output?

idolawoye avatar May 26 '23 22:05 idolawoye

I wrote a Python script for this; maybe you can try it.

import argparse
from os import sep
import pandas as pd

def process_molten_file(molten_file, output_file):
    """Reduce a molten (long-format) pair table to one row per unordered pair.

    The input is a header-less, tab-separated file with three columns: first
    sample, second sample, value.  Rows ``A, B`` and ``B, A`` are treated as
    the same pair; the row with the largest value is kept (on ties, the row
    that appears first in the input).  Self-pairs such as ``A, A`` are kept.

    The result is written to ``output_file`` as TSV with the header
    ``Sample``, ``Pair``, ``Value``, ordered by descending value.

    Args:
        molten_file: Path to the three-column, tab-separated input file.
        output_file: Path the reduced TSV is written to.
    """
    # Read the identifier columns as text so IDs such as "007" are not
    # silently converted to the number 7 by type inference.
    df = pd.read_csv(molten_file, sep="\t", header=None, dtype={0: str, 1: str})
    df.columns = ["Sample", "Pair", "Value"]

    # Largest value first.  A stable sort keeps input order among equal
    # values, so the surviving orientation of a pair is deterministic.
    df = df.sort_values(by="Value", ascending=False, kind="mergesort")

    # Order-independent key per row: (A, B) and (B, A) map to the same tuple.
    # Kept outside the frame so it never ends up in the output file.
    pair_keys = pd.Series(
        [tuple(sorted(pair)) for pair in zip(df["Sample"], df["Pair"])],
        index=df.index,
        dtype=object,
    )

    # Keep only the first (i.e. maximum-value) row for each unordered pair.
    unique_pairs_df = df[~pair_keys.duplicated(keep="first")].reset_index(drop=True)

    # Save the output DataFrame to a TSV file
    unique_pairs_df.to_csv(output_file, sep="\t", index=False)
    print(f"Contents of the reduced molten file are saved to: {output_file}")


if __name__ == "__main__":
    # Command-line entry point: two positional paths, input first, output second.
    cli = argparse.ArgumentParser(description="Reduce molten output to unique pairs and keep max value.")
    cli.add_argument("molten_file", help="Path to the molten output file to be processed")
    cli.add_argument("output_file", help="Path to save the output in TSV format")

    options = cli.parse_args()
    process_molten_file(options.molten_file, options.output_file)

slbai01 avatar Jul 20 '23 11:07 slbai01