skip to Main Content

creating a resultant DataFrame that combines information from both sources while filling missing values in ‘Product Strings’ with corresponding values from ‘df2’.

import pandas as pd

# DataFrame 1
data1 = {'Company ID': [1, 2, 3],
         'Product ID': ['D1', 'D2', 'D3'],
         'SomeColumn1': ['A', 'B', 'C'],
         'SomeColumn2': ['L', 'M', 'N']}
df1 = pd.DataFrame(data1)

# DataFrame 2
data2 = {'Company ID': [2, 3, 4, 3],
         'Product ID': ['D4', 'D5', 'D6', 'D8'],
         'Company String': ['Prod X', 'Prod Y', 'Prod Z', 'Prod Y'],
         'SomeColumn1': ['W', 'X', 'Y', 'Y1']}
df2 = pd.DataFrame(data2)

# DataFrame 3 (Resultant merged DataFrame)
merged_data = {'Company ID': [1, 2, 3, 2, 3, 4, 3],
               'Product ID': ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8'],
               'SomeColumn1': ['A', 'B', 'C', 'W', 'X', 'Y', 'Y1'],
               'SomeColumn2': ['L', 'M', 'N', None, None, None, None],
               'Company String': [None, 'Prod X', 'Prod Y', 'Prod X', 'Prod Y', 'Prod Z', 'Prod Y']}

This is what I could do but is not producing what I intend

def merge_dataframes(df1, df2):
    # Find the common columns except 'Company ID' and 'Product ID'
    common_columns = set(df1.columns) & set(df2.columns)
    common_columns.discard('Company ID')
    common_columns.discard('Product ID')

    # Merge the dataframes on 'Company ID'
    merged_df = pd.merge(df1, df2, on='Company ID', how='outer', suffixes=('', '_df2'))

# Iterate over common columns and copy missing values
for column in common_columns:
    merged_df[column].fillna(merged_df[f'{column}_df2'], inplace=True)
    merged_df.drop(columns=[f'{column}_df2'], inplace=True)

# Concatenate 'Deal ID' while preserving 'Deal ID' from df1
merged_df['Product ID'] = merged_df.apply(lambda row: row['Product ID_df2'] if pd.notna(row['Product ID_df2']) else row['Deal ID'], axis=1)
merged_df.drop(columns=['Product ID_df2'], inplace=True)

return merged_df

Merge the dataframes using the merge_dataframes function

merged_df = merge_dataframes(df1, df2)
print(merged_df)

The results I got was

data = {'Company ID': [1, 2, 3, 3, 4],
        'Product ID': ['D1', 'D4', 'D5', 'D8', 'D6'],
        'SomeColumn1': ['A', 'B', 'C', 'C', 'Y'],
        'SomeColumn2': ['L', 'M', 'N', 'N', None],
        'Company Strings': [None, 'Prod X', 'Prod Y', 'Prod Y', 'Prod Z']}

I am also okay with trying this in SQL

Appreciate it

2

Answers


  1. You can combine the dataframes in this way with pd.concat. This is similar to a UNION ALL in sql:

    merged_df = pd.concat([df1, df2])
    

    And that gives

    print(merged_df)
       Company ID Product ID SomeColumn1 SomeColumn2 Company String
    0           1         D1           A           L            NaN
    1           2         D2           B           M            NaN
    2           3         D3           C           N            NaN
    0           2         D4           W         NaN         Prod X
    1           3         D5           X         NaN         Prod Y
    2           4         D6           Y         NaN         Prod Z
    3           3         D8          Y1         NaN         Prod Y
    

    To fill in the missing Company Strings, we first create an ID:string lookup based on df2, then apply it to the ID column using .map, and use that to fill in the missing values.

    company_lookup = df2.set_index('Company ID')['Company String'].drop_duplicates()
    merged_df['Company String'] = (merged_df['Company String']
                                   .fillna(merged_df['Company ID'].map(company_lookup)))
    
    Login or Signup to reply.
  2. In SQL, you can try this:

    data1:
    'Company ID' 'Product ID'  SomeColumn1 SomeColumn2
      1            'D1'          'A'         'L'
      2            'D2'          'B'         'M'
      3            'D3'          'C'         'N'
    
    data2:
    'Company ID' 'Product ID'  'Company String' SomeColumn1
      2            'D4'          'Prod X'         'W'
      3            'D5'          'Prod Y'         'X'
      4            'D6'          'Prod Z'         'Y'
      3            'D8'          'Prod Y'         'Y1'
    
    select COMPANY_ID
    , PRODUCT_ID
    , SOMECOLUMN1
    , NULL AS COMPANY_STRING
    , SOMECOLUMN2
    from data1
    
    UNION ALL
    
    select COMPANY_ID
    , PRODUCT_ID
    , NULL AS SOMECOLUMN1
    , COMPANY_STRING
    , SOMECOLUMN1
    from data2;
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search