def _run_id_from_filename(quant_file: str) -> str:
    """Return the run id encoded in a quant file name like ``<prefix>_<run_id>.csv``."""
    # Strip the extension first so the run id is never truncated when the name
    # holds more than one "_" (e.g. "quant_SRR1_1.csv") or a non-4-char suffix.
    stem, _extension = os.path.splitext(quant_file)
    tokens = stem.split("_")
    # A name without "_" has no second token; use the whole stem instead of
    # raising IndexError.
    return tokens[1] if len(tokens) > 1 else stem


def create_expression_matrix(raw_data_path: str, processed_data_path: str) -> None:
    """Create the expression matrices for all species.

    For every ``<raw_data_path>/<species>/csv_files`` directory that contains
    files, each quant file is read once and its ``TPM``, ``EffectiveLength`` and
    ``NumReads`` columns (indexed by ``Name``) are gathered into three matrices
    with one column per run. The matrices are passed to
    ``get_length_scaled_tpm_matrix`` (defined outside this block) and the result
    is written to ``<processed_data_path>/<species>.csv``.

    Args:
        raw_data_path (str): Path to the folder containing raw quant files.
        processed_data_path (str): Path to store the processed expression
            matrix csv files. Created if it does not exist yet.

    Returns:
        None: This function does not return a value but outputs files to the
        specified directory.
    """
    if not os.path.isdir(raw_data_path):
        print(f"The provided path {raw_data_path} is not a directory.")
        return

    value_columns = ("TPM", "EffectiveLength", "NumReads")

    # os.listdir order is arbitrary; sorting keeps processing and column order
    # reproducible across runs and file systems.
    for species in sorted(os.listdir(raw_data_path)):
        raw_csv_data_path = os.path.join(raw_data_path, species, "csv_files")
        if not os.path.isdir(raw_csv_data_path):
            continue
        quant_files = sorted(os.listdir(raw_csv_data_path))
        if not quant_files:
            continue

        # One list of single-column frames per value column; concatenated once
        # after the loop instead of re-copying a growing matrix for every file.
        per_run_frames = {column: [] for column in value_columns}
        for quant_file in quant_files:
            file_path = os.path.join(raw_csv_data_path, quant_file)
            # Parse the file a single time and slice the columns afterwards.
            quant_df = pd.read_csv(
                file_path, usecols=["Name", *value_columns]
            ).set_index("Name")
            # Keep only the first row for any repeated Name.
            quant_df = quant_df[~quant_df.index.duplicated(keep="first")]
            run_id = _run_id_from_filename(quant_file)
            for column in value_columns:
                per_run_frames[column].append(
                    quant_df[[column]].rename(columns={column: run_id})
                )

        matrices = {}
        for column in value_columns:
            matrix = pd.concat(per_run_frames[column], axis=1, sort=False)
            # Leave the index unnamed so the written CSV keeps a blank header
            # cell for the index column, the layout produced when the matrix
            # was grown from an empty DataFrame.
            matrix.index.name = None
            matrices[column] = matrix

        length_scaled_tpm_mat = get_length_scaled_tpm_matrix(
            matrices["NumReads"], matrices["TPM"], matrices["EffectiveLength"]
        )

        # to_csv cannot create missing parent directories itself.
        os.makedirs(processed_data_path, exist_ok=True)
        expression_matrix_path = os.path.join(processed_data_path, f"{species}.csv")
        length_scaled_tpm_mat.to_csv(expression_matrix_path)
        print(f"\nExpression matrix for {species} created successfully.")