I have created this file watcher loop. When I run the code, it scans a specific folder for .json files and appends their data to an output.xlsx file. The code then keeps running in a loop, scanning the folder for new files and repeating the process. This works just fine; however, when the code stops (laptop turns off, or something similar), new files are still being added to the folder, and when I re-run the code I cannot continue where I left off. I have to delete the output.xlsx file and start over.
Is there a way to save the history of the files already appended when the code stops, so that when I re-run it, it continues by adding only the files that have not yet been appended?
import os
import glob
import json
import pandas as pd
import time
from datetime import datetime
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
def jsonFilesInDirectory(my_dir: str):
    # Get a list of JSON files in the directory
    json_files = glob.glob(os.path.join(my_dir, "*.json"))
    return json_files
def clean_value(value):
    # Strip the non-breaking-space units ("\xa0s", "\xa0ms") and thousands
    # separators from Lighthouse display values, then convert to float
    return float(value.replace('\xa0s', '').replace('\xa0ms', '').replace(',', ''))
def doThingsWithNewFiles(fileDiff: list, my_dir: str, workbook):
    for file_name in fileDiff:
        # glob already returns the path including the directory
        file_path = file_name
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                json_data = json.load(file)
                # Extract data from the JSON file
                url = json_data["finalUrl"]
                fetch_time = json_data["fetchTime"]
                audits = json_data["audits"]
                fcp_metric = audits["first-contentful-paint"]["id"]
                fcp_value = audits["first-contentful-paint"]["displayValue"]
                fcp_score = audits["first-contentful-paint"]["score"]
                lcp_metric = audits["largest-contentful-paint"]["id"]
                lcp_value = audits["largest-contentful-paint"]["displayValue"]
                lcp_score = audits["largest-contentful-paint"]["score"]
                fmp_metric = audits["first-meaningful-paint"]["id"]
                fmp_value = audits["first-meaningful-paint"]["displayValue"]
                fmp_score = audits["first-meaningful-paint"]["score"]
                si_metric = audits["speed-index"]["id"]
                si_value = audits["speed-index"]["displayValue"]
                si_score = audits["speed-index"]["score"]
                tbt_metric = audits["total-blocking-time"]["id"]
                tbt_value = audits["total-blocking-time"]["displayValue"]
                tbt_score = audits["total-blocking-time"]["score"]
                cls_metric = audits["cumulative-layout-shift"]["id"]
                cls_value = audits["cumulative-layout-shift"]["displayValue"]
                cls_score = audits["cumulative-layout-shift"]["score"]
                categories = json_data["categories"]
                perf_metric = categories["performance"]["id"]
                perf_value = 0
                perf_score = categories["performance"]["score"]
                # Clean up values and format the fetch time
                cleaned_fcp_value = clean_value(fcp_value)
                cleaned_lcp_value = clean_value(lcp_value)
                cleaned_fmp_value = clean_value(fmp_value)
                cleaned_si_value = clean_value(si_value)
                cleaned_tbt_value = clean_value(tbt_value)
                datetime_obj = datetime.strptime(fetch_time, "%Y-%m-%dT%H:%M:%S.%fZ")
                cleaned_fetch_time = datetime_obj.strftime("%Y-%m-%d %H:%M:%S")
                # Create a data dictionary for the DataFrame
                data_dict = {
                    "fetch_time": [cleaned_fetch_time] * 7,
                    "url": [url] * 7,
                    "metric": ["performance", "first_contentful_paint", "largest_contentful_paint",
                               "first-meaningful-paint", "speed-index", "total-blocking-time",
                               "cumulative-layout-shift"],
                    "value": [perf_value, cleaned_fcp_value, cleaned_lcp_value,
                              cleaned_fmp_value, cleaned_si_value, cleaned_tbt_value,
                              cls_value],
                    "score": [perf_score, fcp_score, lcp_score, fmp_score, si_score, tbt_score, cls_score]
                }
                df = pd.DataFrame(data_dict)
                # Append the DataFrame to the Excel file
                sheet_name = "Sheet1"
                if sheet_name in workbook.sheetnames:
                    sheet = workbook[sheet_name]
                    for row in dataframe_to_rows(df, index=False, header=False):
                        sheet.append(row)
                else:
                    sheet = workbook.create_sheet(sheet_name)
                    for row in dataframe_to_rows(df, index=False, header=True):
                        sheet.append(row)
                print(f"Data extracted from {file_name} and appended to the Excel file")
            except KeyError as e:
                print(f"KeyError occurred while processing file '{file_name}': {e}")
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError occurred while processing file '{file_name}': {e}")
            except Exception as e:
                print(f"An error occurred while processing file '{file_name}': {e}")
def fileWatcher(my_dir: str, pollTime: int):
    excel_file_path = os.path.join(my_dir, 'output.xlsx')
    existingFiles = []
    # Check if the output file already exists
    if os.path.isfile(excel_file_path):
        try:
            workbook = openpyxl.load_workbook(excel_file_path)
            existingFiles = jsonFilesInDirectory(my_dir)
            # Process the existing JSON files and append data to the Excel file
            doThingsWithNewFiles(existingFiles, my_dir, workbook)
            print("Existing JSON files processed and data appended to the Excel file")
        except openpyxl.utils.exceptions.InvalidFileException:
            workbook = openpyxl.Workbook()
    else:
        workbook = openpyxl.Workbook()
    # Check for new files at startup
    newFileList = jsonFilesInDirectory(my_dir)
    fileDiff = listComparison(existingFiles, newFileList)
    existingFiles = newFileList
    if len(fileDiff) > 0:
        # Process the new files and append data to the Excel file
        doThingsWithNewFiles(fileDiff, my_dir, workbook)
        # Save the Excel file
        workbook.save(excel_file_path)
        print(f"DataFrame exported to {excel_file_path}")
    while True:
        time.sleep(pollTime)
        # Get the updated list of JSON files in the directory
        newFileList = jsonFilesInDirectory(my_dir)
        # Find the difference between the previous and new file lists
        fileDiff = listComparison(existingFiles, newFileList)
        existingFiles = newFileList
        if len(fileDiff) > 0:
            # Process the new files and append data to the Excel file
            doThingsWithNewFiles(fileDiff, my_dir, workbook)
            # Save the Excel file
            workbook.save(excel_file_path)
            print(f"DataFrame exported to {excel_file_path}")
def listComparison(originalList: list, newList: list):
    # Compare two lists and return the entries that are only in the new list
    differencesList = [x for x in newList if x not in originalList]
    return differencesList
my_dir = r"Z:"
pollTime = 60
fileWatcher(my_dir, pollTime)
2 Answers
The simplest idea: to get the list of files changed since your last update to the Excel file, use os.path.getmtime to get the last-modified time of the Excel file and of all the JSON files, and select the JSON files that are newer. Do this at startup if the Excel file exists, and process each of the selected JSON files as if they had been detected by the watcher (see the sketch below).
However, this could introduce some ambiguity about files written very close to the moment of the power loss. So instead, the more accurate idea: save the list of processed JSON files, either inside the Excel file or in a separate place (e.g. another file, or a database).
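A minimal sketch of the first (mtime) idea, reusing the folder layout from the question; the function name files_newer_than_output is illustrative, not from the original code:

import glob
import os

def files_newer_than_output(my_dir: str, excel_path: str) -> list:
    # If the workbook does not exist yet, every JSON file counts as new
    if not os.path.isfile(excel_path):
        return glob.glob(os.path.join(my_dir, "*.json"))
    last_export = os.path.getmtime(excel_path)
    # Keep only the JSON files modified after the last save of the workbook
    return [path for path in glob.glob(os.path.join(my_dir, "*.json"))
            if os.path.getmtime(path) > last_export]

At startup you would pass the result to doThingsWithNewFiles exactly like a normal poll result.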
An even more refined idea is to use a database where you save the data keyed to the JSON file, using the database as the single source of truth, and generate the Excel file from the database as needed.
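For example, a minimal sketch using sqlite3 from the standard library; the table layout and the file name metrics.db are assumptions for illustration:

import sqlite3

conn = sqlite3.connect("metrics.db")
conn.execute("""
    CREATE TABLE IF NOT EXISTS metrics (
        source_file TEXT,
        fetch_time  TEXT,
        url         TEXT,
        metric      TEXT,
        value       REAL,
        score       REAL,
        PRIMARY KEY (source_file, metric)
    )
""")

def save_rows(rows):
    # INSERT OR REPLACE keyed on (source_file, metric) makes reprocessing
    # the same JSON file idempotent instead of duplicating rows
    conn.executemany("INSERT OR REPLACE INTO metrics VALUES (?, ?, ?, ?, ?, ?)", rows)
    conn.commit()

Regenerating the workbook is then a single step, e.g. pd.read_sql("SELECT * FROM metrics", conn).to_excel("output.xlsx", index=False).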
As an aside, overwriting the Excel file in place is a possible point of failure. A good practice in this situation is to write to a temporary file in the same directory, then call os.replace (or os.rename on POSIX), which atomically replaces the old file with the new one.
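A sketch of that save step, reusing workbook and excel_file_path from the code above:

import os
import tempfile

def save_atomically(workbook, excel_file_path: str):
    directory = os.path.dirname(excel_file_path) or "."
    # The temporary file must live in the same directory (same filesystem)
    # for the final replace to be atomic
    fd, tmp_path = tempfile.mkstemp(suffix=".xlsx", dir=directory)
    os.close(fd)
    try:
        workbook.save(tmp_path)
        os.replace(tmp_path, excel_file_path)
    except BaseException:
        os.remove(tmp_path)
        raise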
You can create a text file that stores the list of files already scanned.
I updated your code to read that file if it exists and to write it back as files are processed.
I couldn't test the code, so let me know if there's any issue with it.
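Since the updated code is not shown above, here is a minimal sketch of that approach; the file name processed_files.txt and the helper names are illustrative assumptions:

import os

HISTORY_FILE = "processed_files.txt"  # illustrative name

def load_processed(my_dir: str) -> set:
    # Read the history file if it exists; otherwise start fresh
    history_path = os.path.join(my_dir, HISTORY_FILE)
    if not os.path.isfile(history_path):
        return set()
    with open(history_path, "r", encoding="utf-8") as f:
        return {line.strip() for line in f if line.strip()}

def mark_processed(my_dir: str, file_name: str):
    # Append each file as soon as it is done, so a crash loses at most
    # the file currently being processed
    with open(os.path.join(my_dir, HISTORY_FILE), "a", encoding="utf-8") as f:
        f.write(file_name + "\n")

In fileWatcher you would initialize existingFiles from load_processed(my_dir) instead of reprocessing everything at startup, and call mark_processed after each successful append in doThingsWithNewFiles.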