creating a resultant DataFrame that combines information from both sources while filling missing values in ‘Product Strings’ with corresponding values from ‘df2’.
import pandas as pd
# DataFrame 1
data1 = {'Company ID': [1, 2, 3],
'Product ID': ['D1', 'D2', 'D3'],
'SomeColumn1': ['A', 'B', 'C'],
'SomeColumn2': ['L', 'M', 'N']}
df1 = pd.DataFrame(data1)
# DataFrame 2
data2 = {'Company ID': [2, 3, 4, 3],
'Product ID': ['D4', 'D5', 'D6', 'D8'],
'Company String': ['Prod X', 'Prod Y', 'Prod Z', 'Prod Y'],
'SomeColumn1': ['W', 'X', 'Y', 'Y1']}
df2 = pd.DataFrame(data2)
# DataFrame 3 (Resultant merged DataFrame)
merged_data = {'Company ID': [1, 2, 3, 2, 3, 4, 3],
'Product ID': ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8'],
'SomeColumn1': ['A', 'B', 'C', 'W', 'X', 'Y', 'Y1'],
'SomeColumn2': ['L', 'M', 'N', None, None, None, None],
'Company String': [None, 'Prod X', 'Prod Y', 'Prod X', 'Prod Y', 'Prod Z', 'Prod Y']}
This is what I could do but is not producing what I intend
def merge_dataframes(df1, df2):
# Find the common columns except 'Company ID' and 'Product ID'
common_columns = set(df1.columns) & set(df2.columns)
common_columns.discard('Company ID')
common_columns.discard('Product ID')
# Merge the dataframes on 'Company ID'
merged_df = pd.merge(df1, df2, on='Company ID', how='outer', suffixes=('', '_df2'))
# Iterate over common columns and copy missing values
for column in common_columns:
merged_df[column].fillna(merged_df[f'{column}_df2'], inplace=True)
merged_df.drop(columns=[f'{column}_df2'], inplace=True)
# Concatenate 'Deal ID' while preserving 'Deal ID' from df1
merged_df['Product ID'] = merged_df.apply(lambda row: row['Product ID_df2'] if pd.notna(row['Product ID_df2']) else row['Deal ID'], axis=1)
merged_df.drop(columns=['Product ID_df2'], inplace=True)
return merged_df
Merge the dataframes using the merge_dataframes function
merged_df = merge_dataframes(df1, df2)
print(merged_df)
The results I got was
data = {'Company ID': [1, 2, 3, 3, 4],
'Product ID': ['D1', 'D4', 'D5', 'D8', 'D6'],
'SomeColumn1': ['A', 'B', 'C', 'C', 'Y'],
'SomeColumn2': ['L', 'M', 'N', 'N', None],
'Company Strings': [None, 'Prod X', 'Prod Y', 'Prod Y', 'Prod Z']}
I am also okay with trying this in SQL
Appreciate it
2
Answers
You can combine the dataframes in this way with
pd.concat
. This is similar to aUNION ALL
in sql:And that gives
To fill in the missing Company Strings, we first create an ID:string lookup based on df2, then apply it to the ID column using
.map
, and use that to fill in the missing values.In SQL, you can try this: