I am currently developing a Python script to retrieve a comprehensive list of all the jobs that were executed yesterday. However, I’m encountering an issue with the script’s pagination mechanism using tokens. Despite my attempts to loop through the pagination process, the resulting output remains unchanged.
Here is the code
import requests
import pandas as pd
import math
import datetime
import json
def fetch_and_process_job_runs(base_uri, api_token, params):
endpoint = '/api/2.1/jobs/runs/list'
headers = {'Authorization': f'Bearer {api_token}'}
all_data = [] # To store all the data from multiple pages
while True:
# print(params)
response = requests.get(base_uri + endpoint, headers=headers, params=params)
response_json = response.json()
data = []
for run in response_json["runs"]:
start_time_ms = run["start_time"]
start_time_seconds = start_time_ms / 1000
start_time_readable = datetime.datetime.fromtimestamp(start_time_seconds).strftime('%Y-%m-%d %H:%M:%S')
data.append({
"job_id": run["job_id"],
"creator_user_name": run["creator_user_name"],
"run_name": run["run_name"],
"run_page_url": run["run_page_url"],
"run_id": run["run_id"],
"execution_duration_in_mins": math.ceil(int(run.get('execution_duration')) / (1000 * 60)),
"result_state": run["state"].get("result_state"),
"start_time": start_time_readable
})
all_data.extend(data)
df = pd.DataFrame(all_data)
print(df)
if response_json.get("has_more") == True:
next_page_token = response_json.get("next_page_token")
params['next_page_token'] = next_page_token
else:
break
df = pd.DataFrame(all_data)
return df
# Replace with your actual values
now = datetime.datetime.utcnow()
yesterday = now - datetime.timedelta(days=1)
start_time_from = int(yesterday.replace(hour=0, minute=0, second=0, microsecond=0).timestamp()) * 1000
start_time_to = int(yesterday.replace(hour=23, minute=59, second=59, microsecond=999999).timestamp()) * 1000
params = {
# "start_time_from": start_time_from,
# "start_time_to": start_time_to,
"expand_tasks": True
}
baseURI = 'https://adb-xxxxxxxxxxxxxx.azuredatabricks.net'
apiToken = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
result_df = fetch_and_process_job_runs(baseURI, apiToken, params)
print(result_df)
Please help me.
2
Answers
I noticed that the value of next_token wasn’t changing in API response and then figured that you have a very small error in your code. The parameter to be passed in request is
page_token
and notnext_page_token
.As per documentation at https://docs.databricks.com/api/workspace/jobs/list,
page_token string
Use
next_page_token
orprev_page_token
returned from the previous request to list the next or previous page of jobs respectively.So
params['next_page_token']
needs to change to
params['page_token']
The best is to use Databricks SDK instead – it will hide API details, and if changes in the future, then your code won’t require changes. It’s simple as:
Plus it automatically works with different authentication methods, etc. across different tools.