Implement Delete timeouts

* Implements new logic around when instances are deleted; see the flowcharts in docs/. Timeouts now trigger the actual deletion of instances from A1's database, and these timeouts are configurable.
* Eliminates the barrier to deleting an instance when no xapp ever replied (via timeouts)
* Adds two new ENV variables that control the timeouts
* Makes unit tests more modular so new workflows can be tested easily
* Changes the API for ../status to return a richer structure; an illustrative sketch follows this list
* Cleans up unused items in the integration tests helm chart
* Removes "RMR_RCV_RETRY_INTERVAL" leftovers since it is no longer used
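
The sketch below illustrates the richer ../status payload and the new ENV variables. The endpoint path, port, field names, and defaults are taken from this change (a1/openapi.yaml, a1/data.py, and the integration tests); the use of the `requests` library and the literal values shown are assumptions for illustration only, not part of this change.

    # Minimal sketch: query the new ../status structure.
    # Defaults introduced/renamed by this change:
    #   A1_RMR_RETRY_TIMES=4, INSTANCE_DELETE_NO_RESP_TTL=5, INSTANCE_DELETE_RESP_TTL=5
    import requests

    res = requests.get(
        "http://localhost:10000/a1-p/policytypes/20000/policies/admission_control_policy/status"
    )
    print(res.json())
    # Expected shape (values illustrative):
    # {"instance_status": "IN EFFECT",   # or "NOT IN EFFECT"
    #  "has_been_deleted": False,        # True once a DELETE has been issued
    #  "created_at": 1571943621.0}       # creation timestamp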

Issue-ID: RICPLT-2619
Signed-off-by: Tommy Carpenter <tc677g@att.com>
Change-Id: I5b2f9cc3a6e8da7fe636a4cde085a5e1a3770f62
diff --git a/Dockerfile-Unit-Test b/Dockerfile-Unit-Test
index 6be53d5..b5d524c 100644
--- a/Dockerfile-Unit-Test
+++ b/Dockerfile-Unit-Test
@@ -44,4 +44,4 @@
 WORKDIR /tmp
 
 # Run the unit tests
-RUN tox
+RUN tox -e py37,flake8
diff --git a/a1/a1rmr.py b/a1/a1rmr.py
index fd0d87f..38f8373 100644
--- a/a1/a1rmr.py
+++ b/a1/a1rmr.py
@@ -30,7 +30,7 @@
 logger = get_module_logger(__name__)
 
 
-RETRY_TIMES = int(os.environ.get("RMR_RETRY_TIMES", 4))
+RETRY_TIMES = int(os.environ.get("A1_RMR_RETRY_TIMES", 4))
 
 # Note; yes, globals are bad, but this is a private (to this module) global
 # No other module can import/access this (well, python doesn't enforce this, but all linters will complain)
@@ -97,23 +97,16 @@
                         break
 
             # read our mailbox and update statuses
-            updated_instances = set()
             for msg in self.rcv_func():
                 try:
                     pay = json.loads(msg["payload"])
                     pti = pay["policy_type_id"]
                     pii = pay["policy_instance_id"]
-                    data.set_status(pti, pii, pay["handler_id"], pay["status"])
-                    updated_instances.add((pti, pii))
+                    data.set_policy_instance_status(pti, pii, pay["handler_id"], pay["status"])
                 except (PolicyTypeNotFound, PolicyInstanceNotFound, KeyError, json.decoder.JSONDecodeError):
                     # TODO: in the future we may also have to catch SDL errors
                     logger.debug(("Dropping malformed or non applicable message", msg))
 
-            # for all updated instances, see if we can trigger a delete
-            # should be no catch needed here, since the status update would have failed if it was a bad pair
-            for ut in updated_instances:
-                data.clean_up_instance(ut[0], ut[1])
-
             # TODO: what's a reasonable sleep time? we don't want to hammer redis too much, and a1 isn't a real time component
             self.last_ran = time.time()
             time.sleep(1)
diff --git a/a1/controller.py b/a1/controller.py
index 75d8456..6f21830 100644
--- a/a1/controller.py
+++ b/a1/controller.py
@@ -142,14 +142,7 @@
         3. "NOT IN EFFECT" otherwise (no statuses, or none are OK but not all are deleted)
     """
 
-    def get_status_handler():
-        vector = data.get_policy_instance_statuses(policy_type_id, policy_instance_id)
-        for i in vector:
-            if i == "OK":
-                return "IN EFFECT", 200
-        return "NOT IN EFFECT", 200
-
-    return _try_func_return(get_status_handler)
+    return _try_func_return(lambda: data.get_policy_instance_status(policy_type_id, policy_instance_id))
 
 
 def create_or_replace_policy_instance(policy_type_id, policy_instance_id):
@@ -188,8 +181,9 @@
     def delete_instance_handler():
         """
         here we send out the DELETEs but we don't delete the instance until a GET is called where we check the statuses
+        We also set the status as deleted, which is reflected in a GET to ../status (before the delete completes)
         """
-        data.instance_is_valid(policy_type_id, policy_instance_id)
+        data.delete_policy_instance(policy_type_id, policy_instance_id)
 
         # send rmr (best effort)
         body = _gen_body_to_handler("DELETE", policy_type_id, policy_instance_id)
diff --git a/a1/data.py b/a1/data.py
index 074f326..d25f519 100644
--- a/a1/data.py
+++ b/a1/data.py
@@ -23,6 +23,9 @@
 For now, the database is in memory.
 We use dict data structures (KV) with the expectation of having to move this into Redis
 """
+import os
+import time
+from threading import Thread
 import msgpack
 from a1.exceptions import PolicyTypeNotFound, PolicyInstanceNotFound, PolicyTypeAlreadyExists, CantDeleteNonEmptyType
 from a1 import get_module_logger
@@ -30,6 +33,10 @@
 logger = get_module_logger(__name__)
 
 
+INSTANCE_DELETE_NO_RESP_TTL = int(os.environ.get("INSTANCE_DELETE_NO_RESP_TTL", 5))
+INSTANCE_DELETE_RESP_TTL = int(os.environ.get("INSTANCE_DELETE_RESP_TTL", 5))
+
+
 class SDLWrapper:
     """
     This is a wrapper around the expected SDL Python interface.
@@ -65,6 +72,7 @@
 
 TYPE_PREFIX = "a1.policy_type."
 INSTANCE_PREFIX = "a1.policy_instance."
+METADATA_PREFIX = "a1.policy_inst_metadata."
 HANDLER_PREFIX = "a1.policy_handler."
 
 
@@ -85,6 +93,13 @@
     return "{0}{1}.{2}".format(INSTANCE_PREFIX, policy_type_id, policy_instance_id)
 
 
+def _generate_instance_metadata_key(policy_type_id, policy_instance_id):
+    """
+    generate a key for a policy instance metadata
+    """
+    return "{0}{1}.{2}".format(METADATA_PREFIX, policy_type_id, policy_instance_id)
+
+
 def _generate_handler_prefix(policy_type_id, policy_instance_id):
     """
     generate the prefix to a handler key
@@ -99,11 +114,28 @@
     return "{0}{1}".format(_generate_handler_prefix(policy_type_id, policy_instance_id), handler_id)
 
 
+def _type_is_valid(policy_type_id):
+    """
+    check that a type is valid
+    """
+    if SDL.get(_generate_type_key(policy_type_id)) is None:
+        raise PolicyTypeNotFound()
+
+
+def _instance_is_valid(policy_type_id, policy_instance_id):
+    """
+    check that an instance is valid
+    """
+    _type_is_valid(policy_type_id)
+    if SDL.get(_generate_instance_key(policy_type_id, policy_instance_id)) is None:
+        raise PolicyInstanceNotFound
+
+
 def _get_statuses(policy_type_id, policy_instance_id):
     """
     shared helper to get statuses for an instance
     """
-    instance_is_valid(policy_type_id, policy_instance_id)
+    _instance_is_valid(policy_type_id, policy_instance_id)
     prefixes_for_handler = "{0}{1}.{2}.".format(HANDLER_PREFIX, policy_type_id, policy_instance_id)
     return list(SDL.find_and_get(prefixes_for_handler).values())
 
@@ -112,7 +144,7 @@
     """
     shared helper to get instance list for a type
     """
-    type_is_valid(policy_type_id)
+    _type_is_valid(policy_type_id)
     prefixes_for_type = "{0}{1}.".format(INSTANCE_PREFIX, policy_type_id)
     instancekeys = SDL.find_and_get(prefixes_for_type).keys()
     return [k.split(prefixes_for_type)[1] for k in instancekeys]
@@ -128,6 +160,32 @@
         SDL.delete(k)
 
 
+def _get_metadata(policy_type_id, policy_instance_id):
+    """
+    get instance metadata
+    """
+    _instance_is_valid(policy_type_id, policy_instance_id)
+    metadata_key = _generate_instance_metadata_key(policy_type_id, policy_instance_id)
+    return SDL.get(metadata_key)
+
+
+def _delete_after(policy_type_id, policy_instance_id, ttl):
+    """
+    this is a blocking function, must call this in a thread to not block!
+    waits ttl seconds, then deletes the instance
+    """
+    _instance_is_valid(policy_type_id, policy_instance_id)
+
+    time.sleep(ttl)
+
+    # ready to delete
+    _clear_handlers(policy_type_id, policy_instance_id)  # delete all the handlers
+    SDL.delete(_generate_instance_key(policy_type_id, policy_instance_id))  # delete instance
+    SDL.delete(_generate_instance_metadata_key(policy_type_id, policy_instance_id))  # delete instance metadata
+    logger.debug("type %s instance %s deleted", policy_type_id, policy_instance_id)
+    raise PolicyInstanceNotFound()
+
+
 # Types
 
 
@@ -140,14 +198,6 @@
     return [int(k.split(TYPE_PREFIX)[1]) for k in typekeys]
 
 
-def type_is_valid(policy_type_id):
-    """
-    check that a type is valid
-    """
-    if SDL.get(_generate_type_key(policy_type_id)) is None:
-        raise PolicyTypeNotFound()
-
-
 def store_policy_type(policy_type_id, body):
     """
     store a policy type if it doesn't already exist
@@ -173,49 +223,39 @@
     """
     retrieve a type
     """
-    type_is_valid(policy_type_id)
+    _type_is_valid(policy_type_id)
     return SDL.get(_generate_type_key(policy_type_id))
 
 
 # Instances
 
 
-def instance_is_valid(policy_type_id, policy_instance_id):
-    """
-    check that an instance is valid
-    """
-    type_is_valid(policy_type_id)
-    if SDL.get(_generate_instance_key(policy_type_id, policy_instance_id)) is None:
-        raise PolicyInstanceNotFound
-
-
 def store_policy_instance(policy_type_id, policy_instance_id, instance):
     """
     Store a policy instance
     """
-    type_is_valid(policy_type_id)
+    _type_is_valid(policy_type_id)
+    creation_timestamp = time.time()
+
+    # store the instance
     key = _generate_instance_key(policy_type_id, policy_instance_id)
     if SDL.get(key) is not None:
         # Reset the statuses because this is a new policy instance, even if it was overwritten
         _clear_handlers(policy_type_id, policy_instance_id)  # delete all the handlers
     SDL.set(key, instance)
 
+    metadata_key = _generate_instance_metadata_key(policy_type_id, policy_instance_id)
+    SDL.set(metadata_key, {"created_at": creation_timestamp, "has_been_deleted": False})
+
 
 def get_policy_instance(policy_type_id, policy_instance_id):
     """
     Retrieve a policy instance
     """
-    instance_is_valid(policy_type_id, policy_instance_id)
+    _instance_is_valid(policy_type_id, policy_instance_id)
     return SDL.get(_generate_instance_key(policy_type_id, policy_instance_id))
 
 
-def get_policy_instance_statuses(policy_type_id, policy_instance_id):
-    """
-    Retrieve the status vector for a policy instance
-    """
-    return _get_statuses(policy_type_id, policy_instance_id)
-
-
 def get_instance_list(policy_type_id):
     """
     retrieve all instance ids for a type
@@ -223,48 +263,57 @@
     return _get_instance_list(policy_type_id)
 
 
+def delete_policy_instance(policy_type_id, policy_instance_id):
+    """
+    initially sets has_been_deleted
+    then launches a thread that waits until the relevant timer expires, and finally deletes the instance
+    """
+    _instance_is_valid(policy_type_id, policy_instance_id)
+
+    # set the metadata first
+    deleted_timestamp = time.time()
+    metadata_key = _generate_instance_metadata_key(policy_type_id, policy_instance_id)
+    existing_metadata = _get_metadata(policy_type_id, policy_instance_id)
+    SDL.set(
+        metadata_key,
+        {"created_at": existing_metadata["created_at"], "has_been_deleted": True, "deleted_at": deleted_timestamp},
+    )
+
+    # wait, then delete
+    vector = _get_statuses(policy_type_id, policy_instance_id)
+    if vector == []:
+        # handler is empty; we wait for t1 to expire then goodnight
+        clos = lambda: _delete_after(policy_type_id, policy_instance_id, INSTANCE_DELETE_NO_RESP_TTL)
+    else:
+        # handler is not empty, we wait max t1,t2 to expire then goodnight
+        clos = lambda: _delete_after(
+            policy_type_id, policy_instance_id, max(INSTANCE_DELETE_RESP_TTL, INSTANCE_DELETE_NO_RESP_TTL)
+        )
+    Thread(target=clos).start()
+
+
 # Statuses
 
 
-def set_status(policy_type_id, policy_instance_id, handler_id, status):
+def set_policy_instance_status(policy_type_id, policy_instance_id, handler_id, status):
     """
     update the database status for a handler
     called from a1's rmr thread
     """
-    type_is_valid(policy_type_id)
-    instance_is_valid(policy_type_id, policy_instance_id)
+    _type_is_valid(policy_type_id)
+    _instance_is_valid(policy_type_id, policy_instance_id)
     SDL.set(_generate_handler_key(policy_type_id, policy_instance_id, handler_id), status)
 
 
-def clean_up_instance(policy_type_id, policy_instance_id):
+def get_policy_instance_status(policy_type_id, policy_instance_id):
     """
-    see if we can delete an instance based on it's status
+    Gets the status of an instance
     """
-    type_is_valid(policy_type_id)
-    instance_is_valid(policy_type_id, policy_instance_id)
-
-    """
-    TODO: not being able to delete if the list is [] is prolematic.
-    There are cases, such as a bad routing file, where this type will never be able to be deleted because it never went to any xapps
-    However, A1 cannot distinguish between the case where [] was never going to work, and the case where it hasn't worked *yet*
-
-    However, removing this constraint also leads to problems.
-    Deleting the instance when the vector is empty, for example doing so “shortly after” the PUT, can lead to a worse race condition where the xapps get the policy after that, implement it, but because the DELETE triggered “too soon”, you can never get the status or do the delete on it again, so the xapps are all implementing the instance roguely.
-
-    This requires some thought to address.
-    For now we stick with the "less bad problem".
-    """
-
-    vector = _get_statuses(policy_type_id, policy_instance_id)
-    if vector != []:
-        all_deleted = True
-        for i in vector:
-            if i != "DELETED":
-                all_deleted = False
-                break  # have at least one not DELETED, do nothing
-
-        # blow away from a1 db
-        if all_deleted:
-            _clear_handlers(policy_type_id, policy_instance_id)  # delete all the handlers
-            SDL.delete(_generate_instance_key(policy_type_id, policy_instance_id))  # delete instance
-            logger.debug("type %s instance %s deleted", policy_type_id, policy_instance_id)
+    _instance_is_valid(policy_type_id, policy_instance_id)
+    metadata = _get_metadata(policy_type_id, policy_instance_id)
+    metadata["instance_status"] = "NOT IN EFFECT"
+    for i in _get_statuses(policy_type_id, policy_instance_id):
+        if i == "OK":
+            metadata["instance_status"] = "IN EFFECT"
+            break
+    return metadata
diff --git a/a1/openapi.yaml b/a1/openapi.yaml
index fed4b77..6975bf6 100644
--- a/a1/openapi.yaml
+++ b/a1/openapi.yaml
@@ -16,7 +16,7 @@
 # ==================================================================================
 openapi: 3.0.0
 info:
-  version: 1.0.0
+  version: 1.1.0
   title: RIC A1
 paths:
   '/a1-p/healthcheck':
@@ -273,12 +273,21 @@
           description: >
             successfully retrieved the status
           content:
-            text/plain:
+            application/json:
               schema:
-                type: string
-                enum:
-                 - IN EFFECT
-                 - NOT IN EFFECT
+                type: object
+                properties:
+                  instance_status:
+                    type: string
+                    enum:
+                     - IN EFFECT
+                     - NOT IN EFFECT
+                  has_been_deleted:
+                    type: boolean
+                  created_at:
+                    type: string
+                    format: date-time
+
         '404':
           description: >
             there is no policy instance with this policy_instance_id or there is no policy type with this policy_type_id
diff --git a/docs/deleted flowchart.pdf b/docs/deleted flowchart.pdf
new file mode 100644
index 0000000..0089b29
--- /dev/null
+++ b/docs/deleted flowchart.pdf
Binary files differ
diff --git a/docs/installation-guide.rst b/docs/installation-guide.rst
index 5ddad48..babd4fb 100644
--- a/docs/installation-guide.rst
+++ b/docs/installation-guide.rst
@@ -1,5 +1,6 @@
 .. This work is licensed under a Creative Commons Attribution 4.0 International License.
 .. http://creativecommons.org/licenses/by/4.0
+.. Copyright (C) 2019 AT&T Intellectual Property
 
 Installation Guide
 ==================
@@ -13,9 +14,11 @@
 
 You can set the following ENVs to change the A1 behavior:
 
-1. ``RMR_RETRY_TIMES`` the number of times failed rmr operations such as
-timeouts and send failures should be retried before A1 gives up and
-returns a 503. The default is ``4``.
+1. ``A1_RMR_RETRY_TIMES``: the number of times failed rmr operations, such as timeouts and send failures, are retried before A1 gives up and returns a 503. The default is ``4``.
+
+2. ``INSTANCE_DELETE_NO_RESP_TTL``: please refer to the delete flowchart in docs/; this is ``T1`` there. The default is ``5`` (seconds). This is the number of seconds A1 waits before removing an instance from the database after a delete is called, in the case that no downstream apps have responded.
+
+3. ``INSTANCE_DELETE_RESP_TTL``: please refer to the delete flowchart in docs/; this is ``T2`` there. The default is ``5`` (seconds). This is the number of seconds A1 waits before removing an instance from the database after a delete is called, in the case that downstream apps have responded.
 
 K8S
 ---
diff --git a/docs/policy instance state diagram.pdf b/docs/policy instance state diagram.pdf
new file mode 100644
index 0000000..d25d339
--- /dev/null
+++ b/docs/policy instance state diagram.pdf
Binary files differ
diff --git a/docs/release-notes.rst b/docs/release-notes.rst
index 49c3a18..daaa241 100644
--- a/docs/release-notes.rst
+++ b/docs/release-notes.rst
@@ -1,5 +1,6 @@
 .. This work is licensed under a Creative Commons Attribution 4.0 International License.
 .. http://creativecommons.org/licenses/by/4.0
+.. Copyright (C) 2019 AT&T Intellectual Property
 
 Release Notes
 ===============
@@ -13,7 +14,6 @@
    :depth: 3
    :local:
 
-
 [1.x.x] - TBD
 -------------
 
@@ -21,8 +21,23 @@
 
     * Represents a resillent version of 1.0.0 that uses Redis for persistence
 
-[1.0.4] - 10/24/2019
---------------------
+
+
+[x.x.x] - TBD
+-------------
+
+::
+
+    * Implements new logic around when instances are deleted; see the flowcharts in docs/. Timeouts now trigger the actual deletion of instances from A1's database, and these timeouts are configurable.
+    * Eliminates the barrier to deleting an instance when no xapp ever replied (via timeouts)
+    * Adds two new ENV variables that control the timeouts
+    * Makes unit tests more modular so new workflows can be tested easily
+    * Changes the API for ../status to return a richer structure
+    * Cleans up unused items in the integration tests helm chart
+    * Removes "RMR_RCV_RETRY_INTERVAL" leftovers since it is no longer used
+
+[1.0.4]
+-------
 
 ::
 
diff --git a/integration_tests/a1mediator/templates/deployment.yaml b/integration_tests/a1mediator/templates/deployment.yaml
index e94045d..a6e0786 100644
--- a/integration_tests/a1mediator/templates/deployment.yaml
+++ b/integration_tests/a1mediator/templates/deployment.yaml
@@ -30,9 +30,13 @@
             value: {{ .Values.rmrservice.name }}
           - name: PYTHONUNBUFFERED
             value: "1"
-          - name: RMR_RETRY_TIMES
+          - name: A1_RMR_RETRY_TIMES
             value: "{{ .Values.rmr_timeout_config.rcv_retry_times }}"
-          image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
+          - name: INSTANCE_DELETE_NO_RESP_TTL
+            value: "5"
+          - name: INSTANCE_DELETE_RESP_TTL
+            value: "10"
+          image: "a1:latest"
           imagePullPolicy: {{ .Values.image.pullPolicy }}
           ports:
             - name: http
diff --git a/integration_tests/a1mediator/templates/secrets.yaml b/integration_tests/a1mediator/templates/secrets.yaml
deleted file mode 100644
index 4afe4ec..0000000
--- a/integration_tests/a1mediator/templates/secrets.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-apiVersion: v1
-kind: Secret
-metadata:
-  name: lfhelper
-type: kubernetes.io/dockerconfigjson
-data:
-  .dockerconfigjson: {{ template "imagePullSecret" . }}
diff --git a/integration_tests/a1mediator/values.yaml b/integration_tests/a1mediator/values.yaml
index 5924f1d..e8bd0d2 100644
--- a/integration_tests/a1mediator/values.yaml
+++ b/integration_tests/a1mediator/values.yaml
@@ -8,13 +8,6 @@
 # name of the secret that allows for privagte registry docker pulls.
 # if the value is "lfhelper", there is a helper function included in this chart, and it uses imageCredentials .
 # imageCredentials is referenced in secrets.yaml, and uses a helper function to formulate the docker reg username and password into a valid dockerconfig.json.
-# Note, if the value of lf_docker_reg_secret is changed, these image credentials are ignored and not used.
-lf_docker_reg_secret: lfhelper
-imageCredentials:
-  registry: nexus3.o-ran-sc.org:10004/ric-plt-a1
-  username:
-  password:
-
 # This is the service for A1's external facing HTTP API
 httpservice:
   name: a1httpservice
diff --git a/integration_tests/putdata b/integration_tests/putdata
deleted file mode 100644
index a2f8d11..0000000
--- a/integration_tests/putdata
+++ /dev/null
@@ -1,6 +0,0 @@
-{
-    "enforce":true,
-    "window_length":10,
-    "blocking_rate":20,
-    "trigger_threshold":10
-}
diff --git a/integration_tests/test_a1.tavern.yaml b/integration_tests/test_a1.tavern.yaml
index b4ccd62..94fe0de 100644
--- a/integration_tests/test_a1.tavern.yaml
+++ b/integration_tests/test_a1.tavern.yaml
@@ -176,7 +176,9 @@
       method: GET
     response:
       status_code: 200
-      # tavern doesn't yet let you check string statuses!!!
+      body:
+        instance_status: "IN EFFECT"
+        has_been_deleted: False
 
   - name: instance list 200 and contains the instance
     request:
@@ -189,13 +191,25 @@
 
   # DELETE the instance and make sure subsequent GETs return properly
   - name: delete the instance
-    delay_after: 3 # give it a few seconds for rmr
+    delay_after: 4
     request:
       url: http://localhost:10000/a1-p/policytypes/20000/policies/admission_control_policy
       method: DELETE
     response:
       status_code: 202
 
+  - name: status should now be not in effect but still there
+    delay_before: 3 # give it a few seconds for rmr
+    delay_after: 8 # 3 + 8 = 11 > 10; that is, wait until t2 expires
+    request:
+      url: http://localhost:10000/a1-p/policytypes/20000/policies/admission_control_policy/status
+      method: GET
+    response:
+      status_code: 200
+      body:
+        instance_status: "NOT IN EFFECT"
+        has_been_deleted: True
+
   - name: instance list 200 but no instance
     request:
       url: http://localhost:10000/a1-p/policytypes/20000/policies
@@ -367,6 +381,16 @@
     response:
       status_code: 202
 
+  - name: test the delay status get, not in effect yet
+    request:
+      url: http://localhost:10000/a1-p/policytypes/20001/policies/delaytest/status
+      method: GET
+    response:
+      status_code: 200
+      body:
+        instance_status: "NOT IN EFFECT"
+        has_been_deleted: False
+
   - name: test the delay policy get
     request:
       url: http://localhost:10000/a1-p/policytypes/20001/policies/delaytest
@@ -376,16 +400,6 @@
       body:
         test: foo
 
-  - name: test the delay status get
-    max_retries: 3
-    delay_before: 6  # give it a few seconds for rmr ; delay reciever sleeps for 5 seconds by default
-    request:
-      url: http://localhost:10000/a1-p/policytypes/20001/policies/delaytest/status
-      method: GET
-    response:
-      status_code: 200
-      # tavern doesn't let you check non json yet!
-
   - name: instance list 200 and there
     request:
       url: http://localhost:10000/a1-p/policytypes/20001/policies
@@ -395,6 +409,55 @@
       body:
        - delaytest
 
+  - name: test the delay status get
+    max_retries: 3
+    delay_before: 6  # give it a few seconds for rmr; delay receiver sleeps for 5 seconds by default
+    request:
+      url: http://localhost:10000/a1-p/policytypes/20001/policies/delaytest/status
+      method: GET
+    response:
+      status_code: 200
+      body:
+        instance_status: "IN EFFECT"
+        has_been_deleted: False
+
+  # DELETE the instance and make sure subsequent GETs return properly
+  - name: delete the instance
+    request:
+      url: http://localhost:10000/a1-p/policytypes/20001/policies/delaytest
+      method: DELETE
+    response:
+      status_code: 202
+
+  - name: test the delay status get immediately
+    request:
+      url: http://localhost:10000/a1-p/policytypes/20001/policies/delaytest/status
+      method: GET
+    response:
+      status_code: 200
+      body:
+        instance_status: "IN EFFECT"
+        has_been_deleted: True
+
+  - name: test the delay status get after delay but before timers
+    delay_before: 7
+    request:
+      url: http://localhost:10000/a1-p/policytypes/20001/policies/delaytest/status
+      method: GET
+    response:
+      status_code: 200
+      body:
+        instance_status: "NOT IN EFFECT"
+        has_been_deleted: True
+
+  - name: test the delay status get after delay and after the timers
+    delay_before: 7
+    request:
+      url: http://localhost:10000/a1-p/policytypes/20001/policies/delaytest/status
+      method: GET
+    response:
+      status_code: 404
+
 ---
 
 test_name: test bad routing file endpoint
diff --git a/tests/test_controller.py b/tests/test_controller.py
index 756a0d0..26e6e63 100644
--- a/tests/test_controller.py
+++ b/tests/test_controller.py
@@ -1,3 +1,6 @@
+"""
+tests for controller
+"""
 # ==================================================================================
 #       Copyright (c) 2019 Nokia
 #       Copyright (c) 2018-2019 AT&T Intellectual Property.
@@ -91,6 +94,140 @@
     monkeypatch.setattr("rmr.rmr.generate_and_set_transaction_id", fake_set_transactionid)
 
 
+def _no_ac(client):
+    # no type there yet
+    res = client.get(ADM_CTRL_TYPE)
+    assert res.status_code == 404
+
+    # no types at all
+    res = client.get("/a1-p/policytypes")
+    assert res.status_code == 200
+    assert res.json == []
+
+    # instance 404 because type not there yet
+    res = client.get(ADM_CTRL_POLICIES)
+    assert res.status_code == 404
+
+
+def _put_ac_type(client, typedef):
+    _no_ac(client)
+
+    # put the type
+    res = client.put(ADM_CTRL_TYPE, json=typedef)
+    assert res.status_code == 201
+
+    # cant replace types
+    res = client.put(ADM_CTRL_TYPE, json=typedef)
+    assert res.status_code == 400
+
+    # type there now
+    res = client.get(ADM_CTRL_TYPE)
+    assert res.status_code == 200
+    assert res.json == typedef
+
+    # type in type list
+    res = client.get("/a1-p/policytypes")
+    assert res.status_code == 200
+    assert res.json == [20000]
+
+    # instance 200 but empty list
+    res = client.get(ADM_CTRL_POLICIES)
+    assert res.status_code == 200
+    assert res.json == []
+
+
+def _delete_ac_type(client):
+    res = client.delete(ADM_CTRL_TYPE)
+    assert res.status_code == 204
+
+    # cant get
+    res = client.get(ADM_CTRL_TYPE)
+    assert res.status_code == 404
+
+    # cant invoke delete on it again
+    res = client.delete(ADM_CTRL_TYPE)
+    assert res.status_code == 404
+
+    _no_ac(client)
+
+
+def _put_ac_instance(client, monkeypatch, instancedef):
+    # no instance there yet
+    res = client.get(ADM_CTRL_INSTANCE)
+    assert res.status_code == 404
+    res = client.get(ADM_CTRL_INSTANCE_STATUS)
+    assert res.status_code == 404
+
+    # create a good instance
+    _test_put_patch(monkeypatch)
+    res = client.put(ADM_CTRL_INSTANCE, json=instancedef)
+    assert res.status_code == 202
+
+    # replace is allowed on instances
+    res = client.put(ADM_CTRL_INSTANCE, json=instancedef)
+    assert res.status_code == 202
+
+    # instance 200 and in list
+    res = client.get(ADM_CTRL_POLICIES)
+    assert res.status_code == 200
+    assert res.json == [ADM_CTRL]
+
+
+def _delete_instance(client):
+    # cant delete type until there are no instances
+    res = client.delete(ADM_CTRL_TYPE)
+    assert res.status_code == 400
+
+    # delete it
+    res = client.delete(ADM_CTRL_INSTANCE)
+    assert res.status_code == 202
+
+    # should be able to do multiple deletes until it's actually gone
+    res = client.delete(ADM_CTRL_INSTANCE)
+    assert res.status_code == 202
+
+
+def _instance_is_gone(client, seconds_to_try=10):
+    for _ in range(seconds_to_try):
+        # idea here is that we have to wait for the separate thread to process the event
+        try:
+            res = client.get(ADM_CTRL_INSTANCE_STATUS)
+            assert res.status_code == 404
+        except AssertionError:
+            time.sleep(1)
+
+    res = client.get(ADM_CTRL_INSTANCE_STATUS)
+    assert res.status_code == 404
+
+    # list still 200 but no instance
+    res = client.get(ADM_CTRL_POLICIES)
+    assert res.status_code == 200
+    assert res.json == []
+
+    # cant get instance
+    res = client.get(ADM_CTRL_INSTANCE)
+    assert res.status_code == 404
+
+
+def _verify_instance_and_status(client, expected_instance, expected_status, expected_deleted, seconds_to_try=5):
+    # get the instance
+    res = client.get(ADM_CTRL_INSTANCE)
+    assert res.status_code == 200
+    assert res.json == expected_instance
+
+    for _ in range(seconds_to_try):
+        # idea here is that we have to wait for the separate thread to process the event
+        res = client.get(ADM_CTRL_INSTANCE_STATUS)
+        assert res.status_code == 200
+        assert res.json["has_been_deleted"] == expected_deleted
+        try:
+            assert res.json["instance_status"] == expected_status
+            return
+        except AssertionError:
+            time.sleep(1)
+    assert res.json["instance_status"] == expected_status
+
+
 # Module level Hack
 
 
@@ -107,122 +244,77 @@
 # Actual Tests
 
 
-def test_workflow_nothing_there_yet(client):
-    """ test policy put good"""
-    # no type there yet
-    res = client.get(ADM_CTRL_TYPE)
-    assert res.status_code == 404
-
-    # no types at all
-    res = client.get("/a1-p/policytypes")
-    assert res.status_code == 200
-    assert res.json == []
-
-    # instance 404 because type not there yet
-    res = client.get(ADM_CTRL_POLICIES)
-    assert res.status_code == 404
-
-
 def test_workflow(client, monkeypatch, adm_type_good, adm_instance_good):
     """
     test a full A1 workflow
     """
-    # put the type
-    res = client.put(ADM_CTRL_TYPE, json=adm_type_good)
-    assert res.status_code == 201
+    _put_ac_type(client, adm_type_good)
+    _put_ac_instance(client, monkeypatch, adm_instance_good)
 
-    # cant replace types
-    res = client.put(ADM_CTRL_TYPE, json=adm_type_good)
-    assert res.status_code == 400
-
-    # type there now
-    res = client.get(ADM_CTRL_TYPE)
-    assert res.status_code == 200
-    assert res.json == adm_type_good
-    res = client.get("/a1-p/policytypes")
-    assert res.status_code == 200
-    assert res.json == [20000]
-
-    # instance 200 but empty list
-    res = client.get(ADM_CTRL_POLICIES)
-    assert res.status_code == 200
-    assert res.json == []
-
-    # no instance there yet
-    res = client.get(ADM_CTRL_INSTANCE)
-    assert res.status_code == 404
-    res = client.get(ADM_CTRL_INSTANCE_STATUS)
-    assert res.status_code == 404
-
-    # create a good instance
-    _test_put_patch(monkeypatch)
-    res = client.put(ADM_CTRL_INSTANCE, json=adm_instance_good)
-    assert res.status_code == 202
-
-    # replace is allowed on instances
-    res = client.put(ADM_CTRL_INSTANCE, json=adm_instance_good)
-    assert res.status_code == 202
-
-    # instance 200 and in list
-    res = client.get(ADM_CTRL_POLICIES)
-    assert res.status_code == 200
-    assert res.json == [ADM_CTRL]
-
-    def get_instance_good(expected):
-        # get the instance
-        res = client.get(ADM_CTRL_INSTANCE)
-        assert res.status_code == 200
-        assert res.json == adm_instance_good
-
-        # get the instance status
-        res = client.get(ADM_CTRL_INSTANCE_STATUS)
-        assert res.status_code == 200
-        assert res.get_data(as_text=True) == expected
+    """
+    we test the state transition diagram of all 5 states here;
+    1. not in effect, not deleted
+    2. in effect, not deleted
+    3. in effect, deleted
+    4. not in effect, deleted
+    5. gone (timeout expires)
+    """
 
     # try a status get but we didn't get any ACKs yet to test NOT IN EFFECT
-    time.sleep(1)  # wait for the rmr thread
-    get_instance_good("NOT IN EFFECT")
+    _verify_instance_and_status(client, adm_instance_good, "NOT IN EFFECT", False)
 
     # now pretend we did get a good ACK
     a1rmr.replace_rcv_func(_fake_dequeue)
-    time.sleep(1)  # wait for the rmr thread
-    get_instance_good("IN EFFECT")
+    _verify_instance_and_status(client, adm_instance_good, "IN EFFECT", False)
 
-    # cant delete type until there are no instances
-    res = client.delete(ADM_CTRL_TYPE)
-    assert res.status_code == 400
-
-    # delete it
-    res = client.delete(ADM_CTRL_INSTANCE)
-    assert res.status_code == 202
-    res = client.delete(ADM_CTRL_INSTANCE)  # should be able to do multiple deletes
-    assert res.status_code == 202
+    # delete the instance
+    _delete_instance(client)
 
     # status after a delete, but there are no messages yet, should still return
-    time.sleep(1)  # wait for the rmr thread
-    get_instance_good("IN EFFECT")
+    _verify_instance_and_status(client, adm_instance_good, "IN EFFECT", True)
 
     # now pretend we deleted successfully
     a1rmr.replace_rcv_func(_fake_dequeue_deleted)
-    time.sleep(1)  # wait for the rmr thread
-    # list still 200 but no instance
-    res = client.get(ADM_CTRL_POLICIES)
-    assert res.status_code == 200
-    assert res.json == []
-    res = client.get(ADM_CTRL_INSTANCE_STATUS)  # cant get status
-    assert res.status_code == 404
-    res = client.get(ADM_CTRL_INSTANCE)  # cant get instance
-    assert res.status_code == 404
+
+    # status should be reflected first (before delete triggers)
+    _verify_instance_and_status(client, adm_instance_good, "NOT IN EFFECT", True)
+
+    # instance should be totally gone after a few seconds
+    _instance_is_gone(client)
 
     # delete the type
-    res = client.delete(ADM_CTRL_TYPE)
-    assert res.status_code == 204
+    _delete_ac_type(client)
 
-    # cant touch this
-    res = client.get(ADM_CTRL_TYPE)
-    assert res.status_code == 404
-    res = client.delete(ADM_CTRL_TYPE)
-    assert res.status_code == 404
+
+def test_cleanup_via_t1(client, monkeypatch, adm_type_good, adm_instance_good):
+    """
+    create a type, create an instance, but no acks ever come in, delete instance
+    """
+    _put_ac_type(client, adm_type_good)
+
+    a1rmr.replace_rcv_func(_fake_dequeue_none)
+
+    _put_ac_instance(client, monkeypatch, adm_instance_good)
+
+    """
+    here we test the state transition diagram when it never goes into effect:
+    1. not in effect, not deleted
+    2. not in effect, deleted
+    3. gone (timeout expires)
+    """
+
+    _verify_instance_and_status(client, adm_instance_good, "NOT IN EFFECT", False)
+
+    # delete the instance
+    _delete_instance(client)
+
+    _verify_instance_and_status(client, adm_instance_good, "NOT IN EFFECT", True)
+
+    # instance should be totally gone after a few seconds
+    _instance_is_gone(client)
+
+    # delete the type
+    _delete_ac_type(client)
 
 
 def test_bad_instances(client, monkeypatch, adm_type_good):
@@ -248,7 +340,6 @@
 
     # get a non existent instance
     a1rmr.replace_rcv_func(_fake_dequeue)
-    time.sleep(1)
     res = client.get(ADM_CTRL_INSTANCE + "DARKNESS")
     assert res.status_code == 404
 
diff --git a/tox-integration.ini b/tox-integration.ini
index e39a8cb..4c25f71 100644
--- a/tox-integration.ini
+++ b/tox-integration.ini
@@ -48,7 +48,7 @@
     pytest --tavern-beta-new-traceback
     echo "running ab"
 # run apache bench
-    ab -n 100 -c 10 -u putdata -T application/json http://localhost:10000/a1-p/policytypes/20000/policies/admission_control_policy
+    ab -n 100 -c 10 -v 4 http://localhost:10000/a1-p/healthcheck
 commands_post=
 #    echo "log collection"
 #    integration_tests/getlogs.sh
diff --git a/tox.ini b/tox.ini
index f6d1800..9606882 100644
--- a/tox.ini
+++ b/tox.ini
@@ -25,8 +25,9 @@
     pytest-cov
 setenv =
     LD_LIBRARY_PATH = /usr/local/lib/:/usr/local/lib64
-    RMR_RCV_RETRY_INTERVAL = 1
-    RMR_RETRY_TIMES = 2
+    A1_RMR_RETRY_TIMES = 2
+    INSTANCE_DELETE_NO_RESP_TTL = 3
+    INSTANCE_DELETE_RESP_TTL = 3
 
 # Note, before this will work, for the first time on that machine, run ./install_deps.sh
 commands =
@@ -41,7 +42,7 @@
 commands = flake8 setup.py a1 tests
 
 [flake8]
-extend-ignore = E501,E741
+extend-ignore = E501,E741,E731
 
 # verbatim as asked for by the docs instructions: https://wiki.o-ran-sc.org/display/DOC/Configure+Repo+for+Documentation
 [testenv:docs]