skip to Main Content

How do I extract/view the csv files created by this block of code? What the code does is pull data from a table and extract the json elements. I am running this code on Jupyter notebook but I can’t figure out how to view the csv file.

print('Conducting ETL for all events data')

class CsvBuilder:
    def __init__(self, fileName: str):
        self._fileName = fileName
        self._headers = []
        self._rows = []

    def hasHeaders(self) -> bool:
        return len(self._headers) > 0

    def setHeaders(self, headers):
        self._headers = headers

    def addRow(self, row):
        self._rows.append(row)

    def write(self):
        file_existed = os.path.isfile(self._fileName)
        with open(self._fileName, 'a') as outputFile:
            writer = csv.writer(outputFile)
            if not file_existed:
                writer.writerow(self._headers)
            writer.writerows(self._rows)


class NullBuilder(CsvBuilder):
    def __init__(self):
        super().__init__(None)

    def hasHeaders(self) -> bool:
        pass

    def setHeaders(self, headers):
        pass

    def addRow(self, row):
        pass

    def write(self):
        pass


class CsvBuilderProvider: 
    def __init__(self):
        self._builders = {}
        self._defaultBuilder = NullBuilder()

    def registerBuilder(self, event: str, builder: CsvBuilder) -> 'CsvBuilderProvider':
        self._builders[event] = builder 
        return self

def getBuilderForEvent(self, event: str):
        if event in self._builders.keys():
            return self._builders[event]
        return self._defaultBuilder


def isJson(value) :
    return str(value)[0] == '{'

for start_date in weeks:
    end_date = start_date + dt.timedelta(days=7)

    print('Querying for date range: {} to {}'.format(start_date, end_date))

    query_data = """
        SELECT events
        FROM webschema.data
        WHERE created_at >= DATE('{start_date}')
        AND created_at < DATE('{end_date}')
        ;
    """.format(start_date=start_date, end_date=end_date)

    cursor.execute(query_data)

    rows = cursor.fetchall()

    print('Query completed')

    csvBuilderProvider = CsvBuilderProvider()

    csvBuilderProvider.registerBuilder('open', CsvBuilder('open.csv')) 
        .registerBuilder('send', CsvBuilder('send.csv')) 
        .registerBuilder('click', CsvBuilder('click.csv')) 
        .registerBuilder('soft_bounce', CsvBuilder('soft_bounce.csv')) 
        .registerBuilder('hard_bounce', CsvBuilder('hard_bounce.csv'))

    print('Processing data')

    csvBuilders = {}
    processed = 0
    errored = 0
    for row in rows:
        try:
            jsonData = json.loads(row[0])
            for event in jsonData:
                eventName = event["event"]
                csvBuilder = csvBuilders.setdefault(eventName, csvBuilderProvider.getBuilderForEvent(eventName))
                if not csvBuilder.hasHeaders():
                    csvBuilder.setHeaders(event.keys())
                newData = []
                for value in event.values():
                    quotedValue = json.dumps(value) if isJson(value) else value
                    newData.append(quotedValue)
                csvBuilder.addRow(newData)
                processed += 1
        except Exception as e:
            errored += 1

    print('Appending processed data to CSV')
    for builder in csvBuilders.values():
        builder.write()

I don’t have a clue how to extract the csv files. It says file not found when I try to run

pd.read_csv('open.csv')

3

Answers


  1. You can add a current working directory to your paths, pathlib module is a good choice for that:

    import pathlib
    CWD = pathlib.Path().resolve()
    ...
    CsvBuilder(CWD / 'open.csv')
    ...
    pd.read_csv(CWD / 'open.csv')
    
    
    Login or Signup to reply.
  2. Lambda stores files locally on the /tmp folder.

    So in order to open the file, you would need to do something like this:

    import os
    os.chdir('/tmp')
    pd.read_csv('open.csv')
    
    Login or Signup to reply.
  3. Doesn’t this work?

    pd.read_csv(csvBuilderProvider.getBuilderForEvent("open")._fileName)
    

    PS: Your code sample has incorrect indentation, also NullBuilder is poorly written, it initialises with an invalid file name, so the least it should do is raise an error (e.g. RuntimeError) when write() is called.

    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search