How do I extract/view the csv files created by this block of code? What the code does is pull data from a table and extract the json elements. I am running this code on Jupyter notebook but I can’t figure out how to view the csv file.
print('Conducting ETL for all events data')
class CsvBuilder:
def __init__(self, fileName: str):
self._fileName = fileName
self._headers = []
self._rows = []
def hasHeaders(self) -> bool:
return len(self._headers) > 0
def setHeaders(self, headers):
self._headers = headers
def addRow(self, row):
self._rows.append(row)
def write(self):
file_existed = os.path.isfile(self._fileName)
with open(self._fileName, 'a') as outputFile:
writer = csv.writer(outputFile)
if not file_existed:
writer.writerow(self._headers)
writer.writerows(self._rows)
class NullBuilder(CsvBuilder):
def __init__(self):
super().__init__(None)
def hasHeaders(self) -> bool:
pass
def setHeaders(self, headers):
pass
def addRow(self, row):
pass
def write(self):
pass
class CsvBuilderProvider:
def __init__(self):
self._builders = {}
self._defaultBuilder = NullBuilder()
def registerBuilder(self, event: str, builder: CsvBuilder) -> 'CsvBuilderProvider':
self._builders[event] = builder
return self
def getBuilderForEvent(self, event: str):
if event in self._builders.keys():
return self._builders[event]
return self._defaultBuilder
def isJson(value) :
return str(value)[0] == '{'
for start_date in weeks:
end_date = start_date + dt.timedelta(days=7)
print('Querying for date range: {} to {}'.format(start_date, end_date))
query_data = """
SELECT events
FROM webschema.data
WHERE created_at >= DATE('{start_date}')
AND created_at < DATE('{end_date}')
;
""".format(start_date=start_date, end_date=end_date)
cursor.execute(query_data)
rows = cursor.fetchall()
print('Query completed')
csvBuilderProvider = CsvBuilderProvider()
csvBuilderProvider.registerBuilder('open', CsvBuilder('open.csv'))
.registerBuilder('send', CsvBuilder('send.csv'))
.registerBuilder('click', CsvBuilder('click.csv'))
.registerBuilder('soft_bounce', CsvBuilder('soft_bounce.csv'))
.registerBuilder('hard_bounce', CsvBuilder('hard_bounce.csv'))
print('Processing data')
csvBuilders = {}
processed = 0
errored = 0
for row in rows:
try:
jsonData = json.loads(row[0])
for event in jsonData:
eventName = event["event"]
csvBuilder = csvBuilders.setdefault(eventName, csvBuilderProvider.getBuilderForEvent(eventName))
if not csvBuilder.hasHeaders():
csvBuilder.setHeaders(event.keys())
newData = []
for value in event.values():
quotedValue = json.dumps(value) if isJson(value) else value
newData.append(quotedValue)
csvBuilder.addRow(newData)
processed += 1
except Exception as e:
errored += 1
print('Appending processed data to CSV')
for builder in csvBuilders.values():
builder.write()
I don’t have a clue how to extract the csv files. It says file not found when I try to run
pd.read_csv('open.csv')
3
Answers
You can add a current working directory to your paths, pathlib module is a good choice for that:
Lambda stores files locally on the /tmp folder.
So in order to open the file, you would need to do something like this:
Doesn’t this work?
PS: Your code sample has incorrect indentation, also
NullBuilder
is poorly written, it initialises with an invalid file name, so the least it should do is raise an error (e.g.RuntimeError
) whenwrite()
is called.