Try on DesignSafe

Filter Tapis Jobs#

by Silvia Mazzoni, DesignSafe, 2025

Let’s create a function that finds a specific job, or set of jobs, that meets certain key criteria – independently of whether you created the job via the web-portal app, or directly via Tapis in a Jupyter Notebook.

Workflow#

  • Connect to Tapis

  • Get a dataframe containing all Job Metadata.

  • We will then query the metadata (pandas dataframe) to find an individual job and then get basic info on that job.

We are going to write a python function so that we may use it with different inputs.

Select a Job Based on Any Key#

  • filter the jobs dataframe

  • you can have a single key, or many.

  • you can search for one job, or many.

Using local utilities library

Connect to Tapis#

connect_tapis.py
# ../OpsUtils/OpsUtils/Tapis/connect_tapis.py
def connect_tapis(token_filePath: str = "~/.tapis_tokens.json",
                  base_url: str = "https://designsafe.tapis.io",
                  username: str = "",
                  password: str = "",
                  force_connect: bool = False):
    """
    Connect to a Tapis platform (e.g., DesignSafe) with automatic token handling.

    Behavior
    --------
    - Looks for a saved access token at `token_filePath` (default: ~/.tapis_tokens.json).
    - If present and not expired, uses it to create an authenticated Tapis client.
    - If missing/expired, or when `force_connect=True`, prompts for credentials,
      requests new tokens, and saves them back to `token_filePath`.
    - Prints expiration details for transparency.

    Parameters
    ----------
    token_filePath : str, default "~/.tapis_tokens.json"
        Path to the JSON file that stores the Tapis `access_token` and `expires_at`.
    base_url : str, default "https://designsafe.tapis.io"
        Tapis API endpoint base URL.
    username : str, default ""
        Optional preset username. If empty, you will be prompted.
    password : str, default ""
        Optional preset password. If empty, you will be prompted (securely).
    force_connect : bool, default False
        If True, ignores any valid saved token and performs a fresh login.

    Returns
    -------
    object
        An authenticated `Tapis` client object ready to use.

    Notes
    -----
    - The token file stores: `{"access_token": "...", "expires_at": "...ISO8601..."}`.
    - Expiry timestamps are treated as UTC if no timezone is present.
    - If the saved token cannot be parsed/validated, a fresh login is performed.
    - Failed logins are retried in a loop (not recursively), so repeated bad
      credentials cannot exhaust the call stack.

    Example
    -------
    t = connect_tapis()                        # use saved token or prompt as needed
    jobs = t.jobs.getJobList()                 # now you're authenticated

    Author
    ------
    Silvia Mazzoni, DesignSafe (silviamazzoni@yahoo.com)

    Date
    ----
    2025-08-14

    Version
    -------
    1.0
    """
    from tapipy.tapis import Tapis
    from getpass import getpass
    from datetime import datetime, timezone
    import json
    import os

    def _parse_expires_at(s: str) -> datetime | None:
        """Parse ISO8601 expiry, accepting 'Z' and naive strings; return aware UTC dt or None."""
        if not s:
            return None
        try:
            # normalize trailing 'Z' to +00:00 so fromisoformat() accepts it
            dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=timezone.utc)
            return dt.astimezone(timezone.utc)
        except Exception:
            return None

    def _login(user: str, pwd: str):
        """Prompt (as needed) for credentials and retry until Tapis issues tokens.

        Iterative replacement for the original recursive retry, so repeated
        failures cannot overflow the stack. Preset credentials are tried on
        the first pass only; subsequent passes always re-prompt.
        """
        while True:
            if not user:
                # username isn't sensitive; echoing can help avoid typos,
                # but keeping the original getpass choice:
                user = getpass("Username: ")
            if not pwd:
                pwd = getpass("Password: ")
            client = Tapis(base_url=base_url, username=user, password=pwd)
            try:
                client.get_tokens()
                return client
            except Exception as e:
                # (fixed typo: original message read "could get token")
                print(f" ** Warning ** could not get token : {e},\n TRY AGAIN!")
                user, pwd = "", ""  # force a fresh prompt next pass

    print(" -- Checking Tapis token --")
    token_path = os.path.expanduser(token_filePath)
    now = datetime.now(timezone.utc)

    t = None
    saved_expires_at = None
    valid_token = False

    # Try to load a saved token
    if os.path.exists(token_path):
        try:
            with open(token_path, "r") as f:
                tokens = json.load(f)
            saved_expires_at = _parse_expires_at(tokens.get("expires_at"))
            if tokens.get("access_token") and saved_expires_at and saved_expires_at > now:
                print(" Token loaded from file. Token is still valid!")
                t = Tapis(base_url=base_url, access_token=tokens["access_token"])
                valid_token = True
            else:
                print(" Token file found but token is missing/expired.")
                if saved_expires_at:
                    print(" Token expired at:", saved_expires_at.isoformat())
        except Exception as e:
            print(f" Could not read/parse token file ({token_path}): {e}")
    else:
        print(" No saved tokens found.")

    if force_connect:
        print(" Forcing a connection to Tapis (fresh login).")

    if not valid_token or force_connect:
        print("-- Connect to Tapis --")
        t = _login(username, password)
        # Save the new token back to the chosen path
        try:
            tokens = {
                "access_token": t.access_token.access_token,
                "expires_at": t.access_token.expires_at.isoformat(),
            }
            # dirname is '' for a bare filename; makedirs('') would raise
            token_dir = os.path.dirname(token_path)
            if token_dir:
                os.makedirs(token_dir, exist_ok=True)
            with open(token_path, "w") as f:
                json.dump(tokens, f)
            print(f" Token saved to {token_path}")
            saved_expires_at = _parse_expires_at(tokens["expires_at"])
        except Exception as e:
            print(f" Warning: could not save token to {token_path}: {e}")

    # Print expiry info (use stored/parsed date if needed)
    exp_to_show = saved_expires_at
    try:
        # if available, prefer the client object's value
        if getattr(t, "access_token", None) and getattr(t.access_token, "expires_at", None):
            exp_to_show = _parse_expires_at(str(t.access_token.expires_at)) or exp_to_show
    except Exception:
        pass

    if exp_to_show:
        print(" Token expires at:", exp_to_show.isoformat())
        print(" Token expires in:", str(exp_to_show - now))
    else:
        print(" Token expiry time unavailable.")

    print("-- LOG IN SUCCESSFUL! --")
    return t
t=OpsUtils.connect_tapis()
 -- Checking Tapis token --
 Token loaded from file. Token is still valid!
 Token expires at: 2025-08-21T02:49:32+00:00
 Token expires in: 3:39:59.079504
-- LOG IN SUCCESSFUL! --

Get All-Jobs Metadata as a dataframe#

It’s best to just get all the job data once and then filter on that dataframe.

get_tapis_jobs_df.py
# ../OpsUtils/OpsUtils/Tapis/get_tapis_jobs_df.py
def get_tapis_jobs_df(t, displayIt=False, NmaxJobs=500):
    """
    Retrieve a list of jobs from Tapis and organize them into a Pandas DataFrame.

    This function fetches up to NmaxJobs from the user's Tapis account, converts the
    results into a structured DataFrame, adds a convenient index column plus parsed
    time columns (_dt, _unix, _date), and moves key metadata columns (like name,
    uuid, status) to the front for easier exploration.

    It can also optionally display the DataFrame (entire or just the head) right in
    the notebook for quick inspection (uses the notebook's `display()`).

    Parameters
    ----------
    t : Tapis
        An authenticated Tapis client (from connect_tapis()).

    displayIt : bool or str, default=False
        If 'head' or 'displayHead', displays only the first few rows.
        If True, 'display', 'displayAll', or 'all', displays the entire DataFrame.
        If False, no display output (just returns the DataFrame).

    NmaxJobs : int, default=500
        Maximum number of jobs to retrieve from Tapis.

    Returns
    -------
    pandas.DataFrame
        DataFrame containing metadata for the fetched jobs. For each time field
        ('created', 'remoteStarted', 'ended', 'lastUpdated') three derived columns
        are added: <key>_dt (tz-aware datetime, NaT if missing), <key>_unix
        (integer seconds, -1 if missing — matching the sentinel used by
        filter_tapis_jobs_df), and <key>_date (datetime.date, NaT if missing).

    Example
    -------
    df = get_tapis_jobs_df(t, displayIt='head', NmaxJobs=1000)
    """
    # Silvia Mazzoni, 2025
    from datetime import datetime, timezone
    import pandas as pd

    # Get jobs from Tapis
    jobslist = t.jobs.getJobList(limit=NmaxJobs)

    # Convert TapisResult objects to dictionaries and build the DataFrame
    df = pd.DataFrame([job.__dict__ for job in jobslist])

    # Add index column for convenience
    df["index_column"] = df.index

    # Add parsed/derived time columns.  Jobs that never started have a missing
    # timestamp (NaT); the original int64 cast turned those into the int64-min
    # sentinel (and a bogus 1677 date), so handle NaT explicitly instead.
    for thisK in ['created', 'remoteStarted', 'ended', 'lastUpdated']:
        dt_col = pd.to_datetime(df[thisK], utc=True)
        df[f'{thisK}_dt'] = dt_col
        df[f'{thisK}_unix'] = dt_col.apply(
            lambda x: int(x.timestamp()) if pd.notna(x) else -1
        )
        df[f'{thisK}_date'] = dt_col.dt.date

    # Reorder columns: put key ones first if they exist
    startCols = ['index_column', 'name', 'uuid', 'status', 'appId', 'appVersion']
    existingStartCols = [col for col in startCols if col in df.columns]
    remainingCols = [col for col in df.columns if col not in existingStartCols]
    df = df[existingStartCols + remainingCols]

    # Optional display logic
    if displayIt != False:
        print(f'Found {len(df)} jobs')
        opt = displayIt.lower() if isinstance(displayIt, str) else ''
        if displayIt is True or opt in ('display', 'displayall', 'all'):
            display(df)
        # bug fix: original compared .lower() output against 'displayHead',
        # which could never match
        elif opt in ('head', 'displayhead'):
            display(df.head())

    return df



    
Unfiltered_df = OpsUtils.get_tapis_jobs_df(t)
display(Unfiltered_df)
index_column name uuid status appId appVersion owner created condition remoteStarted ... created_date remoteStarted_dt remoteStarted_unix remoteStarted_date ended_dt ended_unix ended_date lastUpdated_dt lastUpdated_unix lastUpdated_date
0 0 opensees-mp-s3-3.6.0_2024-06-13T18:18:01 0d3c401b-1807-45a7-904c-ea50d381d2ca-007 FAILED opensees-mp-s3 3.6.0 silvia 2024-06-20T21:29:15.384516Z None None ... 2024-06-20 NaT -9223372037 1677-09-21 2024-06-20 21:40:18.629466+00:00 1718919618 2024-06-20 2024-06-20 21:40:18.629466+00:00 1718919618 2024-06-20
1 1 opensees-mp-s3-3.6.0_2024-06-13T18:18:01 a983892a-f8a7-45cd-91a9-fe87747bb49c-007 FAILED opensees-mp-s3 3.6.0 silvia 2024-06-13T18:18:10.809303Z None None ... 2024-06-13 NaT -9223372037 1677-09-21 2024-06-13 18:19:12.957455+00:00 1718302752 2024-06-13 2024-06-13 18:19:12.957455+00:00 1718302752 2024-06-13
2 2 opensees-interactive-3.7.0_2024-10-08T20:57:15 65e05dee-13d9-4b56-8d83-f9957991f28a-007 FAILED opensees-interactive 3.7.0 silvia 2024-10-08T20:57:24.066497Z None 2024-10-08T20:57:46.786802Z ... 2024-10-08 2024-10-08 20:57:46.786802+00:00 1728421066 2024-10-08 2024-10-08 21:36:31.742985+00:00 1728423391 2024-10-08 2024-10-08 21:36:31.742985+00:00 1728423391 2024-10-08
3 3 opensees-interactive-3.7.0_2024-08-11T02:15:19 d5ca8c92-6858-4c2d-84c8-de3a6891fab6-007 FAILED opensees-interactive 3.7.0 silvia 2024-08-11T02:15:36.208122Z None 2024-08-11T02:15:58.677513Z ... 2024-08-11 2024-08-11 02:15:58.677513+00:00 1723342558 2024-08-11 2024-08-12 02:18:52.616413+00:00 1723429132 2024-08-12 2024-08-12 02:18:52.627855+00:00 1723429132 2024-08-12
4 4 opensees-interactive-3.7.0_2024-08-14T01:48:24 aba14450-27bc-42ae-93f7-2330afdd6462-007 FAILED opensees-interactive 3.7.0 silvia 2024-08-14T01:48:32.046028Z None 2024-08-14T01:48:54.607485Z ... 2024-08-14 2024-08-14 01:48:54.607485+00:00 1723600134 2024-08-14 2024-08-14 03:29:59.323250+00:00 1723606199 2024-08-14 2024-08-14 03:29:59.333848+00:00 1723606199 2024-08-14
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
495 495 opensees-interactive-latest_2025-06-19T21:12:20 d04b3140-acde-4d64-a8c1-7d8c46f5b25c-007 FAILED opensees-interactive latest silvia 2025-06-19T21:27:49.686635Z None 2025-06-19T21:28:12.183247Z ... 2025-06-19 2025-06-19 21:28:12.183247+00:00 1750368492 2025-06-19 2025-06-19 21:29:01.579244+00:00 1750368541 2025-06-19 2025-06-19 21:29:01.579244+00:00 1750368541 2025-06-19
496 496 opensees-interactive-latest_2025-06-19T21:12:20 fb630e5c-69f1-4370-b49e-b01e9d3da673-007 FAILED opensees-interactive latest silvia 2025-06-19T22:38:07.428425Z None 2025-06-19T22:38:40.331526Z ... 2025-06-19 2025-06-19 22:38:40.331526+00:00 1750372720 2025-06-19 2025-06-19 22:39:08.866665+00:00 1750372748 2025-06-19 2025-06-19 22:39:08.866665+00:00 1750372748 2025-06-19
497 497 OpenSees_agnostic-app-test_zipFolderOut 967f77be-8108-4750-bafa-f1bf9ddc34d8-007 FINISHED agnostic-app-test 0.0.21 silvia 2025-08-20T01:40:15.974840Z None 2025-08-20T01:41:34.213701Z ... 2025-08-20 2025-08-20 01:41:34.213701+00:00 1755654094 2025-08-20 2025-08-20 01:42:28.713746+00:00 1755654148 2025-08-20 2025-08-20 01:42:28.713746+00:00 1755654148 2025-08-20
498 498 opensees-MP-multiMotion-dapi 8d9405ff-ffc0-46f3-b20a-930596ff4481-007 FAILED opensees-mp-s3 latest silvia 2025-06-09T23:56:14.759209Z None None ... 2025-06-09 NaT -9223372037 1677-09-21 2025-06-09 23:57:25.251713+00:00 1749513445 2025-06-09 2025-06-09 23:57:25.251713+00:00 1749513445 2025-06-09
499 499 OpsTrain_JobSubmit_WebPortal f3633ed7-116a-45ec-bb8f-100978f6e5b7-007 FINISHED opensees-mp-s3 latest silvia 2025-06-03T05:57:51.692995Z None 2025-06-03T05:59:15.487231Z ... 2025-06-03 2025-06-03 05:59:15.487231+00:00 1748930355 2025-06-03 2025-06-03 06:02:53.709377+00:00 1748930573 2025-06-03 2025-06-03 06:02:53.709377+00:00 1748930573 2025-06-03

500 rows × 27 columns

Filter Job Metadata#

I have written a python function filter_tapis_jobs_df() that will filter the dataframe based on our inputs.

  • Filters a Tapis jobs dataframe (filtered_df) based on the SelectCriteria dict.

  • Handles:

    • time fields (string ranges converted to Unix timestamps)

    • direct lists of categories (isin)

    • single values or dates

  • Returns:

    • list of filtered uuids

    • the filtered dataframe

  • Optionally displays the dataframe & uuid list.

filter_tapis_jobs_df.py
# ../OpsUtils/OpsUtils/Tapis/filter_tapis_jobs_df.py
def filter_tapis_jobs_df(SelectCriteria, filtered_df, displayIt=False):
    """
    Filter a Tapis jobs DataFrame based on a flexible SelectCriteria dictionary.

    Parameters
    ----------
    SelectCriteria : dict
        A dictionary where keys are column names (like 'status', 'appId', or time fields)
        and values are either:
            - A list of values for `isin` checks,
            - A two-element list for ranges (especially dates),
            - Or a single value for exact matching.

    filtered_df : pandas.DataFrame
        The DataFrame to filter, typically generated by get_tapis_jobs_df().
        Should include time columns like created_unix, created_dt, created_date.

    displayIt : bool, optional
        If truthy, prints a summary of how many jobs matched, their UUIDs,
        and displays the filtered DataFrame (uses the notebook's `display()`).

    Returns
    -------
    tuple
        (filtered_uuid, filtered_df)
        - filtered_uuid : list of uuids matching the filters
        - filtered_df : the filtered DataFrame

    Filtering Logic
    ---------------
    - Recognizes time fields: 'created', 'remoteStarted', 'ended', 'lastUpdated'
      and handles any of their suffixes (_unix, _dt, _date) intelligently.
    - If the SelectCriteria key is one of these, it supports:
        - Ranges: ['2024-08-01', '2024-08-31'] → filters on unix or datetime columns.
        - Single dates: '2024-08-15' → matches that specific day.
    - For all other keys:
        - Lists are used with isin().
        - Single values are checked with ==.

    Notes
    -----
    - Internally converts timestamp strings to Unix time to allow robust comparisons.
    - Handles missing or malformed timestamps gracefully by excluding them.
    - Will skip any keys not present in the DataFrame.
    """
    # Silvia Mazzoni, 2025
    from datetime import datetime, timezone
    import re

    def convert_time_unix(timestamp_str):
        """ISO-ish timestamp or bare date string -> Unix seconds (UTC); -1 on failure."""
        try:
            if not timestamp_str:
                return -1
            ts_clean = timestamp_str.rstrip('Z')
            if 'T' in ts_clean:
                fmt = "%Y-%m-%dT%H:%M:%S.%f" if re.search(r'\.\d+', ts_clean) else "%Y-%m-%dT%H:%M:%S"
            else:
                fmt = "%Y-%m-%d"
            outTime = datetime.strptime(ts_clean, fmt).replace(tzinfo=timezone.utc)
            return outTime.timestamp()
        except Exception:
            return -1

    time_keys = ['created', 'remoteStarted', 'ended', 'lastUpdated']
    time_keys_unix = [f"{k}_unix" for k in time_keys]
    time_keys_dt = [f"{k}_dt" for k in time_keys]
    time_keys_date = [f"{k}_date" for k in time_keys]

    for key, values in SelectCriteria.items():
        if key not in filtered_df.columns:
            continue

        # Handle two-element ranges
        if isinstance(values, list) and len(values) == 2:
            if key in time_keys or key in time_keys_unix:
                # Bug fix: the original always appended '_unix' to the key, so a
                # key already ending in '_unix' (e.g. 'created_unix') produced
                # 'created_unix_unix' and a KeyError. Strip the suffix first.
                base = key[:-len('_unix')] if key in time_keys_unix else key
                # bare time-field keys take date/time strings; *_unix keys take numbers
                min_time = convert_time_unix(values[0]) if key in time_keys else values[0]
                max_time = convert_time_unix(values[1]) if key in time_keys else values[1]
                filtered_df = filtered_df[
                    (filtered_df[f"{base}_unix"] >= min_time) & (filtered_df[f"{base}_unix"] <= max_time)
                ]
            elif key in time_keys_dt or key in time_keys_date:
                filtered_df = filtered_df[
                    (filtered_df[key] >= values[0]) & (filtered_df[key] <= values[1])
                ]
            else:
                filtered_df = filtered_df[filtered_df[key].isin(values)]

        # Handle single date match for any time-field variant: match that whole day
        elif key in time_keys + time_keys_unix + time_keys_dt + time_keys_date:
            try:
                target_date = datetime.strptime(values, "%Y-%m-%d").date()
                for tk in time_keys:
                    if key.startswith(tk):
                        filtered_df = filtered_df[filtered_df[f"{tk}_date"] == target_date]
            except Exception:
                # unparsable single value: leave the DataFrame unfiltered for this key
                pass

        # Handle single exact value
        else:
            filtered_df = filtered_df[filtered_df[key] == values]

    filtered_uuid = list(filtered_df['uuid'])

    if displayIt:
        print(f'Found {len(filtered_df)} jobs')
        if len(filtered_uuid) > 0:
            print('-- uuid --')
            display(filtered_uuid)
        print('-- Job Metadata --')
        display(filtered_df)

    return filtered_uuid, filtered_df

Filters#

You need to know what the filter keys are.

print("Select-Criteria Key Options:",Unfiltered_df.keys())
Select-Criteria Key Options: Index(['index_column', 'name', 'uuid', 'status', 'appId', 'appVersion',
       'owner', 'created', 'condition', 'remoteStarted', 'ended', 'tenant',
       'execSystemId', 'archiveSystemId', 'lastUpdated', 'created_dt',
       'created_unix', 'created_date', 'remoteStarted_dt',
       'remoteStarted_unix', 'remoteStarted_date', 'ended_dt', 'ended_unix',
       'ended_date', 'lastUpdated_dt', 'lastUpdated_unix', 'lastUpdated_date'],
      dtype='object')

Apply filters#

# User Input
SelectCriteria = {}
SelectCriteria['appId'] = 'opensees-mp-s3'; # this is the web-portal app for OpenSeesMP, use a list for multiple criteria
SelectCriteria['appVersion'] = 'latest'; # this is the version of the web-portal app, use a list for multiple criteria
SelectCriteria['created_date'] = ['2025-05-01','2025-05-05'] # use a single value for a specific date, use a 2-value list for a date range
SelectCriteria['created_date'] = '2025-05-01' # use a single value for a specific date, use a 2-value list for a date range
# SelectCriteria['created'] = ['2025-06-01','2025-06-05'] # use a single value for a specific date, use a 2-value list for a date range
# SelectCriteria['created'] = '2025-06-02' # use a single value for a specific date, use a 2-value list for a date range
filtered_uuid,filtered_df = OpsUtils.filter_tapis_jobs_df(SelectCriteria,Unfiltered_df,displayIt='dataframe')
Found 2 jobs
-- uuid --
['c6fb244f-0ee5-44ae-8181-b6c327fdbb78-007',
 '1e818bd0-975b-492e-9322-55ec561b052f-007']
-- Job Metadata --
index_column name uuid status appId appVersion owner created condition remoteStarted ... created_date remoteStarted_dt remoteStarted_unix remoteStarted_date ended_dt ended_unix ended_date lastUpdated_dt lastUpdated_unix lastUpdated_date
391 391 opensees-mp-s3-latest_2025-05-01T23:08:01 c6fb244f-0ee5-44ae-8181-b6c327fdbb78-007 FINISHED opensees-mp-s3 latest silvia 2025-05-01T23:08:14.663066Z None 2025-05-01T23:09:21.251303Z ... 2025-05-01 2025-05-01 23:09:21.251303+00:00 1746140961 2025-05-01 2025-05-01 23:10:28.031897+00:00 1746141028 2025-05-01 2025-05-01 23:10:28.031897+00:00 1746141028 2025-05-01
402 402 opensees-mp-s3-latest_2025-05-01T22:32:40 1e818bd0-975b-492e-9322-55ec561b052f-007 FINISHED opensees-mp-s3 latest silvia 2025-05-01T22:38:24.788245Z None 2025-05-01T22:39:56.323240Z ... 2025-05-01 2025-05-01 22:39:56.323240+00:00 1746139196 2025-05-01 2025-05-01 22:41:14.817659+00:00 1746139274 2025-05-01 2025-05-01 22:41:14.817659+00:00 1746139274 2025-05-01

2 rows × 27 columns