Try on DesignSafe

Filter Tapis Jobs#

by Silvia Mazzoni, DesignSafe, 2025

Let’s create a function that finds a specific job, or set of jobs, that meets certain key criteria – independently of whether you created the job via the web-portal app, or directly via Tapis in a Jupyter Notebook.

Workflow#

  • Connect to Tapis

  • Get a dataframe containing all Job Metadata.

  • We will then query the metadata (pandas dataframe) to find an individual job and then get basic info on that job.

We are going to write a python function so that we may use it with different inputs.

Select a Job Based on Any Key#

  • filter the jobs dataframe

  • you can have a single key, or many.

  • you can search for one job, or many.

Using local utilities library

Connect to Tapis#

connect_tapis.py
# ../OpsUtils/OpsUtils/Tapis/connect_tapis.py
def connect_tapis(token_filePath: str = "~/.tapis_tokens.json",
                  base_url: str = "https://designsafe.tapis.io",
                  username: str = "",
                  password: str = "",
                  force_connect: bool = False):
    """
    Connect to a Tapis platform (e.g., DesignSafe) with automatic token handling.

    Behavior
    --------
    - Looks for a saved access token at `token_filePath` (default: ~/.tapis_tokens.json).
    - If present and not expired, uses it to create an authenticated Tapis client.
    - If missing/expired, or when `force_connect=True`, prompts for credentials,
      requests new tokens, and saves them back to `token_filePath`.
    - Prints expiration details for transparency.

    Parameters
    ----------
    token_filePath : str, default "~/.tapis_tokens.json"
        Path to the JSON file that stores the Tapis `access_token` and `expires_at`.
    base_url : str, default "https://designsafe.tapis.io"
        Tapis API endpoint base URL.
    username : str, default ""
        Optional preset username. If empty, you will be prompted.
    password : str, default ""
        Optional preset password. If empty, you will be prompted (securely).
    force_connect : bool, default False
        If True, ignores any valid saved token and performs a fresh login.

    Returns
    -------
    object
        An authenticated `Tapis` client object ready to use.

    Notes
    -----
    - The token file stores: `{"access_token": "...", "expires_at": "...ISO8601..."}`.
    - Expiry timestamps are treated as UTC if no timezone is present.
    - If the saved token cannot be parsed/validated, a fresh login is performed.
    - Failed logins are retried in a loop (not recursively), so repeated bad
      credentials cannot exhaust the call stack.

    Example
    -------
    t = connect_tapis()                        # use saved token or prompt as needed
    jobs = t.jobs.getJobList()                 # now you're authenticated

    Author
    ------
    Silvia Mazzoni, DesignSafe (silviamazzoni@yahoo.com)

    Date
    ----
    2025-08-14

    Version
    -------
    1.0
    """
    from tapipy.tapis import Tapis
    from getpass import getpass
    from datetime import datetime, timezone
    import json
    import os

    def _parse_expires_at(s: str) -> datetime | None:
        """Parse ISO8601 expiry, accepting 'Z' and naive strings; return aware UTC dt or None."""
        if not s:
            return None
        try:
            # normalize trailing 'Z' to +00:00 so fromisoformat() accepts it
            dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=timezone.utc)
            return dt.astimezone(timezone.utc)
        except Exception:
            return None

    def _login(user: str, pwd: str):
        """Prompt (as needed) for credentials and retry until Tapis issues tokens.

        Iterative replacement for the original recursive retry, so repeated
        failures cannot overflow the stack. Preset credentials are tried on
        the first pass only; subsequent passes always re-prompt.
        """
        while True:
            if not user:
                # username isn't sensitive; echoing can help avoid typos,
                # but keeping the original getpass choice:
                user = getpass("Username: ")
            if not pwd:
                pwd = getpass("Password: ")
            client = Tapis(base_url=base_url, username=user, password=pwd)
            try:
                client.get_tokens()
                return client
            except Exception as e:
                # (fixed typo: original message read "could get token")
                print(f" ** Warning ** could not get token : {e},\n TRY AGAIN!")
                user, pwd = "", ""  # force a fresh prompt next pass

    print(" -- Checking Tapis token --")
    token_path = os.path.expanduser(token_filePath)
    now = datetime.now(timezone.utc)

    t = None
    saved_expires_at = None
    valid_token = False

    # Try to load a saved token
    if os.path.exists(token_path):
        try:
            with open(token_path, "r") as f:
                tokens = json.load(f)
            saved_expires_at = _parse_expires_at(tokens.get("expires_at"))
            if tokens.get("access_token") and saved_expires_at and saved_expires_at > now:
                print(" Token loaded from file. Token is still valid!")
                t = Tapis(base_url=base_url, access_token=tokens["access_token"])
                valid_token = True
            else:
                print(" Token file found but token is missing/expired.")
                if saved_expires_at:
                    print(" Token expired at:", saved_expires_at.isoformat())
        except Exception as e:
            print(f" Could not read/parse token file ({token_path}): {e}")
    else:
        print(" No saved tokens found.")

    if force_connect:
        print(" Forcing a connection to Tapis (fresh login).")

    if not valid_token or force_connect:
        print("-- Connect to Tapis --")
        t = _login(username, password)
        # Save the new token back to the chosen path
        try:
            tokens = {
                "access_token": t.access_token.access_token,
                "expires_at": t.access_token.expires_at.isoformat(),
            }
            # dirname is '' for a bare filename; makedirs('') would raise
            token_dir = os.path.dirname(token_path)
            if token_dir:
                os.makedirs(token_dir, exist_ok=True)
            with open(token_path, "w") as f:
                json.dump(tokens, f)
            print(f" Token saved to {token_path}")
            saved_expires_at = _parse_expires_at(tokens["expires_at"])
        except Exception as e:
            print(f" Warning: could not save token to {token_path}: {e}")

    # Print expiry info (use stored/parsed date if needed)
    exp_to_show = saved_expires_at
    try:
        # if available, prefer the client object's value
        if getattr(t, "access_token", None) and getattr(t.access_token, "expires_at", None):
            exp_to_show = _parse_expires_at(str(t.access_token.expires_at)) or exp_to_show
    except Exception:
        pass

    if exp_to_show:
        print(" Token expires at:", exp_to_show.isoformat())
        print(" Token expires in:", str(exp_to_show - now))
    else:
        print(" Token expiry time unavailable.")

    print("-- LOG IN SUCCESSFUL! --")
    return t
t=OpsUtils.connect_tapis()
 -- Checking Tapis token --
 Token loaded from file. Token is still valid!
 Token expires at: 2025-08-21T02:49:32+00:00
 Token expires in: 3:39:59.079504
-- LOG IN SUCCESSFUL! --

Get All-Jobs Metadata as a dataframe#

It’s best to just get all the job data once and then filter on that dataframe.

get_tapis_jobs_df.py
# ../OpsUtils/OpsUtils/Tapis/get_tapis_jobs_df.py
def get_tapis_jobs_df(t, displayIt=False, NmaxJobs=500):
    """
    Retrieve a list of jobs from Tapis and organize them into a Pandas DataFrame.

    This function fetches up to NmaxJobs from the user's Tapis account, converts the
    results into a structured DataFrame, adds a convenient index column plus parsed
    time columns (_dt, _unix, _date), and moves key metadata columns (like name,
    uuid, status) to the front for easier exploration.

    It can also optionally display the DataFrame (entire or just the head) right in
    the notebook for quick inspection (uses the notebook's `display()`).

    Parameters
    ----------
    t : Tapis
        An authenticated Tapis client (from connect_tapis()).

    displayIt : bool or str, default=False
        If 'head' or 'displayHead', displays only the first few rows.
        If True, 'display', 'displayAll', or 'all', displays the entire DataFrame.
        If False, no display output (just returns the DataFrame).

    NmaxJobs : int, default=500
        Maximum number of jobs to retrieve from Tapis.

    Returns
    -------
    pandas.DataFrame
        DataFrame containing metadata for the fetched jobs. For each time field
        ('created', 'remoteStarted', 'ended', 'lastUpdated') three derived columns
        are added: <key>_dt (tz-aware datetime, NaT if missing), <key>_unix
        (integer seconds, -1 if missing — matching the sentinel used by
        filter_tapis_jobs_df), and <key>_date (datetime.date, NaT if missing).

    Example
    -------
    df = get_tapis_jobs_df(t, displayIt='head', NmaxJobs=1000)
    """
    # Silvia Mazzoni, 2025
    from datetime import datetime, timezone
    import pandas as pd

    # Get jobs from Tapis
    jobslist = t.jobs.getJobList(limit=NmaxJobs)

    # Convert TapisResult objects to dictionaries and build the DataFrame
    df = pd.DataFrame([job.__dict__ for job in jobslist])

    # Add index column for convenience
    df["index_column"] = df.index

    # Add parsed/derived time columns.  Jobs that never started have a missing
    # timestamp (NaT); the original int64 cast turned those into the int64-min
    # sentinel (and a bogus 1677 date), so handle NaT explicitly instead.
    for thisK in ['created', 'remoteStarted', 'ended', 'lastUpdated']:
        dt_col = pd.to_datetime(df[thisK], utc=True)
        df[f'{thisK}_dt'] = dt_col
        df[f'{thisK}_unix'] = dt_col.apply(
            lambda x: int(x.timestamp()) if pd.notna(x) else -1
        )
        df[f'{thisK}_date'] = dt_col.dt.date

    # Reorder columns: put key ones first if they exist
    startCols = ['index_column', 'name', 'uuid', 'status', 'appId', 'appVersion']
    existingStartCols = [col for col in startCols if col in df.columns]
    remainingCols = [col for col in df.columns if col not in existingStartCols]
    df = df[existingStartCols + remainingCols]

    # Optional display logic
    if displayIt != False:
        print(f'Found {len(df)} jobs')
        opt = displayIt.lower() if isinstance(displayIt, str) else ''
        if displayIt is True or opt in ('display', 'displayall', 'all'):
            display(df)
        # bug fix: original compared .lower() output against 'displayHead',
        # which could never match
        elif opt in ('head', 'displayhead'):
            display(df.head())

    return df



    
Unfiltered_df = OpsUtils.get_tapis_jobs_df(t)
display(Unfiltered_df)
index_column name uuid status appId appVersion owner created condition remoteStarted ... created_date remoteStarted_dt remoteStarted_unix remoteStarted_date ended_dt ended_unix ended_date lastUpdated_dt lastUpdated_unix lastUpdated_date
0 0 opensees-mp-s3-3.6.0_2024-06-13T18:18:01 0d3c401b-1807-45a7-904c-ea50d381d2ca-007 FAILED opensees-mp-s3 3.6.0 silvia 2024-06-20T21:29:15.384516Z None None ... 2024-06-20 NaT -9223372037 1677-09-21 2024-06-20 21:40:18.629466+00:00 1718919618 2024-06-20 2024-06-20 21:40:18.629466+00:00 1718919618 2024-06-20
1 1 opensees-mp-s3-3.6.0_2024-06-13T18:18:01 a983892a-f8a7-45cd-91a9-fe87747bb49c-007 FAILED opensees-mp-s3 3.6.0 silvia 2024-06-13T18:18:10.809303Z None None ... 2024-06-13 NaT -9223372037 1677-09-21 2024-06-13 18:19:12.957455+00:00 1718302752 2024-06-13 2024-06-13 18:19:12.957455+00:00 1718302752 2024-06-13
2 2 opensees-interactive-3.7.0_2024-10-08T20:57:15 65e05dee-13d9-4b56-8d83-f9957991f28a-007 FAILED opensees-interactive 3.7.0 silvia 2024-10-08T20:57:24.066497Z None 2024-10-08T20:57:46.786802Z ... 2024-10-08 2024-10-08 20:57:46.786802+00:00 1728421066 2024-10-08 2024-10-08 21:36:31.742985+00:00 1728423391 2024-10-08 2024-10-08 21:36:31.742985+00:00 1728423391 2024-10-08
3 3 opensees-interactive-3.7.0_2024-08-11T02:15:19 d5ca8c92-6858-4c2d-84c8-de3a6891fab6-007 FAILED opensees-interactive 3.7.0 silvia 2024-08-11T02:15:36.208122Z None 2024-08-11T02:15:58.677513Z ... 2024-08-11 2024-08-11 02:15:58.677513+00:00 1723342558 2024-08-11 2024-08-12 02:18:52.616413+00:00 1723429132 2024-08-12 2024-08-12 02:18:52.627855+00:00 1723429132 2024-08-12
4 4 opensees-interactive-3.7.0_2024-08-14T01:48:24 aba14450-27bc-42ae-93f7-2330afdd6462-007 FAILED opensees-interactive 3.7.0 silvia 2024-08-14T01:48:32.046028Z None 2024-08-14T01:48:54.607485Z ... 2024-08-14 2024-08-14 01:48:54.607485+00:00 1723600134 2024-08-14 2024-08-14 03:29:59.323250+00:00 1723606199 2024-08-14 2024-08-14 03:29:59.333848+00:00 1723606199 2024-08-14
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
495 495 opensees-interactive-latest_2025-06-19T21:12:20 d04b3140-acde-4d64-a8c1-7d8c46f5b25c-007 FAILED opensees-interactive latest silvia 2025-06-19T21:27:49.686635Z None 2025-06-19T21:28:12.183247Z ... 2025-06-19 2025-06-19 21:28:12.183247+00:00 1750368492 2025-06-19 2025-06-19 21:29:01.579244+00:00 1750368541 2025-06-19 2025-06-19 21:29:01.579244+00:00 1750368541 2025-06-19
496 496 opensees-interactive-latest_2025-06-19T21:12:20 fb630e5c-69f1-4370-b49e-b01e9d3da673-007 FAILED opensees-interactive latest silvia 2025-06-19T22:38:07.428425Z None 2025-06-19T22:38:40.331526Z ... 2025-06-19 2025-06-19 22:38:40.331526+00:00 1750372720 2025-06-19 2025-06-19 22:39:08.866665+00:00 1750372748 2025-06-19 2025-06-19 22:39:08.866665+00:00 1750372748 2025-06-19
497 497 OpenSees_agnostic-app-test_zipFolderOut 967f77be-8108-4750-bafa-f1bf9ddc34d8-007 FINISHED agnostic-app-test 0.0.21 silvia 2025-08-20T01:40:15.974840Z None 2025-08-20T01:41:34.213701Z ... 2025-08-20 2025-08-20 01:41:34.213701+00:00 1755654094 2025-08-20 2025-08-20 01:42:28.713746+00:00 1755654148 2025-08-20 2025-08-20 01:42:28.713746+00:00 1755654148 2025-08-20
498 498 opensees-MP-multiMotion-dapi 8d9405ff-ffc0-46f3-b20a-930596ff4481-007 FAILED opensees-mp-s3 latest silvia 2025-06-09T23:56:14.759209Z None None ... 2025-06-09 NaT -9223372037 1677-09-21 2025-06-09 23:57:25.251713+00:00 1749513445 2025-06-09 2025-06-09 23:57:25.251713+00:00 1749513445 2025-06-09
499 499 OpsTrain_JobSubmit_WebPortal f3633ed7-116a-45ec-bb8f-100978f6e5b7-007 FINISHED opensees-mp-s3 latest silvia 2025-06-03T05:57:51.692995Z None 2025-06-03T05:59:15.487231Z ... 2025-06-03 2025-06-03 05:59:15.487231+00:00 1748930355 2025-06-03 2025-06-03 06:02:53.709377+00:00 1748930573 2025-06-03 2025-06-03 06:02:53.709377+00:00 1748930573 2025-06-03

500 rows × 27 columns

Filter Job Metadata#

I have written a python function filter_tapis_jobs_df() that will filter the dataframe based on our inputs.

  • Filters a Tapis jobs dataframe (filtered_df) based on the SelectCriteria dict.

  • Handles:

    • time fields (string ranges converted to Unix timestamps)

    • direct lists of categories (isin)

    • single values or dates

  • Returns:

    • list of filtered uuids

    • the filtered dataframe

  • Optionally displays the dataframe & uuid list.

filter_tapis_jobs_df.py
# ../OpsUtils/OpsUtils/Tapis/filter_tapis_jobs_df.py
def filter_tapis_jobs_df(SelectCriteria, filtered_df, displayIt=False):
    """
    Filter a Tapis jobs DataFrame based on a flexible SelectCriteria dictionary.

    Parameters
    ----------
    SelectCriteria : dict
        A dictionary where keys are column names (like 'status', 'appId', or time fields)
        and values are either:
            - A list of values for `isin` checks,
            - A two-element list for ranges (especially dates),
            - Or a single value for exact matching.

    filtered_df : pandas.DataFrame
        The DataFrame to filter, typically generated by get_tapis_jobs_df().
        Should include time columns like created_unix, created_dt, created_date.

    displayIt : bool, optional
        If truthy, prints a summary of how many jobs matched, their UUIDs,
        and displays the filtered DataFrame (uses the notebook's `display()`).

    Returns
    -------
    tuple
        (filtered_uuid, filtered_df)
        - filtered_uuid : list of uuids matching the filters
        - filtered_df : the filtered DataFrame

    Filtering Logic
    ---------------
    - Recognizes time fields: 'created', 'remoteStarted', 'ended', 'lastUpdated'
      and handles any of their suffixes (_unix, _dt, _date) intelligently.
    - If the SelectCriteria key is one of these, it supports:
        - Ranges: ['2024-08-01', '2024-08-31'] → filters on unix or datetime columns.
        - Single dates: '2024-08-15' → matches that specific day.
    - For all other keys:
        - Lists are used with isin().
        - Single values are checked with ==.

    Notes
    -----
    - Internally converts timestamp strings to Unix time to allow robust comparisons.
    - Handles missing or malformed timestamps gracefully by excluding them.
    - Will skip any keys not present in the DataFrame.
    """
    # Silvia Mazzoni, 2025
    from datetime import datetime, timezone
    import re

    def convert_time_unix(timestamp_str):
        """ISO-ish timestamp or bare date string -> Unix seconds (UTC); -1 on failure."""
        try:
            if not timestamp_str:
                return -1
            ts_clean = timestamp_str.rstrip('Z')
            if 'T' in ts_clean:
                fmt = "%Y-%m-%dT%H:%M:%S.%f" if re.search(r'\.\d+', ts_clean) else "%Y-%m-%dT%H:%M:%S"
            else:
                fmt = "%Y-%m-%d"
            outTime = datetime.strptime(ts_clean, fmt).replace(tzinfo=timezone.utc)
            return outTime.timestamp()
        except Exception:
            return -1

    time_keys = ['created', 'remoteStarted', 'ended', 'lastUpdated']
    time_keys_unix = [f"{k}_unix" for k in time_keys]
    time_keys_dt = [f"{k}_dt" for k in time_keys]
    time_keys_date = [f"{k}_date" for k in time_keys]

    for key, values in SelectCriteria.items():
        if key not in filtered_df.columns:
            continue

        # Handle two-element ranges
        if isinstance(values, list) and len(values) == 2:
            if key in time_keys or key in time_keys_unix:
                # Bug fix: the original always appended '_unix' to the key, so a
                # key already ending in '_unix' (e.g. 'created_unix') produced
                # 'created_unix_unix' and a KeyError. Strip the suffix first.
                base = key[:-len('_unix')] if key in time_keys_unix else key
                # bare time-field keys take date/time strings; *_unix keys take numbers
                min_time = convert_time_unix(values[0]) if key in time_keys else values[0]
                max_time = convert_time_unix(values[1]) if key in time_keys else values[1]
                filtered_df = filtered_df[
                    (filtered_df[f"{base}_unix"] >= min_time) & (filtered_df[f"{base}_unix"] <= max_time)
                ]
            elif key in time_keys_dt or key in time_keys_date:
                filtered_df = filtered_df[
                    (filtered_df[key] >= values[0]) & (filtered_df[key] <= values[1])
                ]
            else:
                filtered_df = filtered_df[filtered_df[key].isin(values)]

        # Handle single date match for any time-field variant: match that whole day
        elif key in time_keys + time_keys_unix + time_keys_dt + time_keys_date:
            try:
                target_date = datetime.strptime(values, "%Y-%m-%d").date()
                for tk in time_keys:
                    if key.startswith(tk):
                        filtered_df = filtered_df[filtered_df[f"{tk}_date"] == target_date]
            except Exception:
                # unparsable single value: leave the DataFrame unfiltered for this key
                pass

        # Handle single exact value
        else:
            filtered_df = filtered_df[filtered_df[key] == values]

    filtered_uuid = list(filtered_df['uuid'])

    if displayIt:
        print(f'Found {len(filtered_df)} jobs')
        if len(filtered_uuid) > 0:
            print('-- uuid --')
            display(filtered_uuid)
        print('-- Job Metadata --')
        display(filtered_df)

    return filtered_uuid, filtered_df

Filters#

You need to know what the filter keys are.

print("Select-Criteria Key Options:",Unfiltered_df.keys())
Select-Criteria Key Options: Index(['index_column', 'name', 'uuid', 'status', 'appId', 'appVersion',
       'owner', 'created', 'condition', 'remoteStarted', 'ended', 'tenant',
       'execSystemId', 'archiveSystemId', 'lastUpdated', 'created_dt',
       'created_unix', 'created_date', 'remoteStarted_dt',
       'remoteStarted_unix', 'remoteStarted_date', 'ended_dt', 'ended_unix',
       'ended_date', 'lastUpdated_dt', 'lastUpdated_unix', 'lastUpdated_date'],
      dtype='object')

Apply filters#

# User Input
SelectCriteria = {}
SelectCriteria['appId'] = 'opensees-mp-s3'; # this is the web-portal app for OpenSeesMP, use a list for multiple criteria
SelectCriteria['appVersion'] = 'latest'; # this is the version of the web-portal app, use a list for multiple criteria
SelectCriteria['created_date'] = ['2025-05-01','2025-05-05'] # use a single value for a specific date, use a 2-value list for a date range
SelectCriteria['created_date'] = '2025-05-01' # use a single value for a specific date, use a 2-value list for a date range
# SelectCriteria['created'] = ['2025-06-01','2025-06-05'] # use a single value for a specific date, use a 2-value list for a date range
# SelectCriteria['created'] = '2025-06-02' # use a single value for a specific date, use a 2-value list for a date range
filtered_uuid,filtered_df = OpsUtils.filter_tapis_jobs_df(SelectCriteria,Unfiltered_df,displayIt='dataframe')
Found 2 jobs
-- uuid --
['c6fb244f-0ee5-44ae-8181-b6c327fdbb78-007',
 '1e818bd0-975b-492e-9322-55ec561b052f-007']
-- Job Metadata --
index_column name uuid status appId appVersion owner created condition remoteStarted ... created_date remoteStarted_dt remoteStarted_unix remoteStarted_date ended_dt ended_unix ended_date lastUpdated_dt lastUpdated_unix lastUpdated_date
391 391 opensees-mp-s3-latest_2025-05-01T23:08:01 c6fb244f-0ee5-44ae-8181-b6c327fdbb78-007 FINISHED opensees-mp-s3 latest silvia 2025-05-01T23:08:14.663066Z None 2025-05-01T23:09:21.251303Z ... 2025-05-01 2025-05-01 23:09:21.251303+00:00 1746140961 2025-05-01 2025-05-01 23:10:28.031897+00:00 1746141028 2025-05-01 2025-05-01 23:10:28.031897+00:00 1746141028 2025-05-01
402 402 opensees-mp-s3-latest_2025-05-01T22:32:40 1e818bd0-975b-492e-9322-55ec561b052f-007 FINISHED opensees-mp-s3 latest silvia 2025-05-01T22:38:24.788245Z None 2025-05-01T22:39:56.323240Z ... 2025-05-01 2025-05-01 22:39:56.323240+00:00 1746139196 2025-05-01 2025-05-01 22:41:14.817659+00:00 1746139274 2025-05-01 2025-05-01 22:41:14.817659+00:00 1746139274 2025-05-01

2 rows × 27 columns