python_vpn_audit/vpn_audit/vpn_mongo.py

from pymongo import InsertOne, DeleteMany, ReplaceOne, UpdateOne, UpdateMany
from bson.json_util import dumps, loads
import logging
from netaddr import valid_ipv4
import re

# pymongo pymongo-4.3.3 has issues with the older version of mongo installed on rstlcnscmgd01.open.corp.tnsi.com, use an older library
# pymongo.errors.ConfigurationError: Server at rstlcnscmgd01.open.corp.tnsi.com:27017 reports wire version 3, but this version of PyMongo requires at least 6 (MongoDB 3.6).
# pip install pymongo==3.13.0

def document_ids(collection, query={}, query_modifier={}):
    ## get all document ids in a collection
    # device_ids = document_ids(mycollection)

    ## get all contactable cisco device ids
    # device_ids = document_ids(mycollection, { "session_protocol" : "ssh", "vendor": "cisco" })

    ## get all contactable cisco device ids where device type is IP-VPNAGG using a query modifier
    # device_ids = document_ids(mycollection, { "session_protocol" : "ssh", "vendor": "cisco" }, { "DeviceType": "IP-VPNAGG" })
    if len(query_modifier) >0:
      query.update(query_modifier)
    result = [d for d in collection.distinct('_id', query)]
    return result

def device_names(collection, query_modifier = {}, device_name_list = []):
    if len(device_name_list) >0:
      query = { "DeviceName" : { "$in" : device_name_list } }
    else:
      query = {}
    if len(query_modifier) >0:
      query.update(query_modifier)
    projection = {"_id": 1, "DeviceName": 1}
    result = {d['DeviceName']:d['_id'] for d in collection.find(query, projection)}
    return result

#def deduplicate_collection(collection, mode='list', ignore_schema_keys=['_id'], required_schema_keys=[]):
def deduplicate_collection(**kwargs):
    ## check params
    required_args = ['collection']
    missing_args = [arg for arg in required_args if arg not in kwargs.keys()]
    if len(missing_args) >0:
      print(f'{deduplicate_collection.__name__} missing arguments {missing_args}')
      quit()
    collection = kwargs['collection']
    mode = kwargs['mode'] if 'mode' in kwargs.keys() else 'list'
    ignore_schema_keys = kwargs['ignore_schema_keys'] if 'ignore_schema_keys' in kwargs.keys() else ['_id']
    required_schema_keys = kwargs['required_schema_keys'] if 'required_schema_keys' in kwargs.keys() else []
    logger_name = kwargs['logger_name'] if 'logger_name' in kwargs.keys() else 'main'
    logger = logging.getLogger(logger_name)

    ## what does this func do?
    #
    ## dedupe all documents with exactly matching schemas and keys, required_schema_keys=[] must be an empty list
    #
    #    # this example will find all exactly matching documents with the exact same keys and key values excluding the unique attribute '_id', then remove duplicates
    #    mode = 'list' / 'show' / 'delete'
    #    ignore_schema_keys = ['_id']
    #    deduplicate_collection(collection = collection, mode = mode, ignore_schema_keys = ignore_schema_keys)
    #
    #    Or
    #
    ## dedupe documents where smaller documents are a subset of larger documents with matching key values, described by the schema in required_schema_keys=[]
    #    keeps the largest document(s), discards the smaller document(s)
    #    small document: ["local_ip", "p1_ivrf", "peer_ip", "p1_dh_group", "p1_encr_algo", "p1_hash_algo", "p1_auth_type", "p1_status", "local_port", "crypto_session_interface", "peer_port", "p2_fvrf", "peer_vpn_id"]
    #    large document: ['local_ip', 'p1_ivrf', 'peer_ip', 'p1_dh_group', 'p1_encr_algo', 'p1_hash_algo', 'p1_auth_type', 'p1_status', 'local_port', 'ipsec_flow', 'crypto_session_interface', 'peer_port', 'p2_fvrf', 'peer_vpn_id', 'p2_interface', 'protected_vrf', 'pfs', 'p2_encr_algo', 'p2_hash_algo', 'p2_status', 'crypto_map', 'RRI_enabled', 'transform_sets', 'default_p2_3des', 'last_modified']
    #
    #    # this example shows the state of an idle tunnel, when it is up the phase2 and crypto session information is present, when the scrape is rerun it may capture the tunnel in either state, potentially creating two distinct documents
    #    mode = 'list' / 'show' / 'delete'
    #    ignore_schema_keys = ['_id', 'last_modified', 'session_status']
    #    idle_connection = ["local_ip", "p1_ivrf", "peer_ip", "p1_dh_group", "p1_encr_algo", "p1_hash_algo", "p1_auth_type", "p1_status", "local_port", "crypto_session_interface", "peer_port", "p2_fvrf", "peer_vpn_id"]
    #    deduplicate_collection(collection = collection, mode = mode, ignore_schema_keys = ignore_schema_keys, required_schema_keys = idle_connection)
    #
    ## the ignore_schema_keys=[] list masks keys from the search query that would otherwise identify documents as unique, at a miniumum the '_id' key must be excluded

    ## init
    delete_object_ids = []
    document_schemas = []
    partial_dedupe_document_key_count = 0

    ## select dynamic dedupe matching exact document keys or partial dedupe matching some document keys
    if len(required_schema_keys) >0:
      partial_dedupe = True
      operation = 'operation = deduplication of all documents as subsets of larger documents containing matching schemas and key values'
      partial_dedupe_document_key_count = len(ignore_schema_keys) + len(required_schema_keys)
    else:
      partial_dedupe = False
      operation = 'operation = deduplication of all documents with matching schemas and key values'

    ## return all documents in collection to find all keys in collection and all distinct document schemas
    # inefficient, with later versions of mongo this can be achieved within the query language
    # https://stackoverflow.com/questions/2298870/get-names-of-all-keys-in-the-collection
    result = collection.find()
    for i in result:
      keys = [k for k in i.keys() if k not in ignore_schema_keys]
      if keys not in document_schemas:
        document_schemas.append(keys)
    # print(f'available schemas\n{document_schemas}')

    ## get all the schema keys in collection, this is used to mask keys with exact document key matching
    all_schema_keys = list(set(sum(document_schemas, [])))
    # print(f'all schema keys\n{all_schema_keys}')

    # override document_schemas with a single document schema from required_schema_keys
    if partial_dedupe:
      document_schemas = [required_schema_keys]

    ## find duplicate documents per schema
    for schema in document_schemas:
      ## get all _id schema keys used in aggregate query to match duplicate documents
      id_keys = {k:f'${k}' for k in schema}
      include_keys = {k:{ "$exists": True } for k in schema}

      ## merge include keys {'$exists': True} + exclude keys{'$exists': False} for the first aggregate $match filter to ensure only records with the exact same keys as schema are matched
      # find all keys in all_schema_keys not in this document schema
      exclude_keys_list = list(set(all_schema_keys) - set(schema))
      # exclude_keys_list.append('test_key')
      exclude_keys = {k:{ "$exists": False} for k in exclude_keys_list}
      mask_keys = include_keys.copy()
      mask_keys.update(exclude_keys)
      # print(f'document match query for this schema\n{mask_keys}')

      # ## debug
      # print('\n')
      # print(f'document schema\n{schema}')
      # print(f'mask documents with required keys only\n{include_keys}')
      # print(f'mask documents with exact keys\n{mask_keys}')
      # print(f'search documents with these keys by matching value (should match schema)\n{mask_keys}')

      ## find duplicate documents with matching values for id_keys with schema mask_keys, or find one/more document(s) matching schema mask_keys
      if not partial_dedupe:
        match_count = 2
        query_keys = mask_keys
      else:
        match_count = 1
        query_keys = include_keys

      ## return the content of duplicate documents
      duplicates = collection.aggregate([
        {
          "$match": mask_keys
        },
        { "$group": {
            "_id": id_keys,
            "count": {"$sum": 1}
          }
        },
        { "$match": {
            "count": {"$gte": match_count}
          }
        },
        { "$sort": {
            "count": -1
          }
        }
      ])
      # print(dumps(duplicates, indent=4))

      ## loop duplicate document content, aggregate search using schema keys mask and get document object_ids for deletion
      for duplicate_document in duplicates:
        query = {k:v for k, v in duplicate_document['_id'].items()}
        # print(query)
        filtered_result = collection.aggregate([
          {
            "$match": query_keys
          },
          {
            "$match": query
          },
        ])
        #print(dumps(filtered_result, indent=4))
        #print(len(dumps(filtered_result, indent=4)))

        if not partial_dedupe:
          ## get document ids of exactly matching documents
          object_ids = [r['_id'] for r in filtered_result]
          ## remove the first duplicate document_id, this will be the only remaining document
          object_ids.pop(0)
          # print(object_ids)
          delete_object_ids.extend(object_ids)
        else:
          preserve_document_ids = []
          query_result = [r for r in filtered_result]
          query_result_ids = [r['_id'] for r in query_result]
          if len(query_result) >1:
            for d in query_result:
              if len(d.keys()) > partial_dedupe_document_key_count:
                preserve_document_ids.append(d['_id'])

            ## this is too much logic, this can be done calling the function again for full deduplication, if date evaluation is needed another function should be written more targetted towards the use case
            ## keep this logic for the 'previous/current_configuration' key generation and the date range selection for report generation
            #
            # query_result.sort(key=len, reverse=True)
            # largest_document_len = len(query_result[0].keys())
            # largest_documents = [d for d in query_result if len(d.keys()) == largest_document_len]
            # # print(dumps(largest_documents, indent=4))
            # latest_date = datetime.datetime.min
            # latest_document = {}
            # for l in largest_documents:
            #   if 'last_modified' in l:
            #     if l['last_modified'] > latest_date:
            #       latest_date = l['last_modified']
            #       latest_document.update(l)
            # if len(latest_document.keys()) >0:
            #   preserve_document_ids.append(latest_document['_id'])
            # else:
            #   preserve_document_ids.append(largest_documents[0]['_id'])

          elif len(query_result) == 1:
            preserve_document_ids.append(query_result[0]['_id'])
          ## find document ids to remove
          remove_document_ids = [i for i in query_result_ids if i not in preserve_document_ids]
          delete_object_ids.extend(remove_document_ids)

          # ## debug
          # print('\n')
          # print(f'all documents {query_result_ids}')
          # print(f'remove documents {remove_document_ids}')
          # print(f'keep documents {preserve_document_ids}')

    ## get unique document_ids
    delete_object_ids = list(set(delete_object_ids))

    ## list object_ids of duplicate records
    if mode == 'list':
      # print(operation)
      # print(f'mode = {mode}\n')
      # print(f'object_ids to delete\n{delete_object_ids}')
      logger.info(f'{operation}')
      logger.info(f'mode = {mode}')
      logger.info(f'object_ids to delete:')
      logger.info(f'{delete_object_ids}')

    ## show duplicate records
    if mode == 'show':
      # print(operation)
      # print(f'mode = {mode}\n')
      # query = { "_id" : { "$in" : delete_object_ids } }
      # result = collection.find(query)
      # print('documents to delete')
      # for r in result:
      #   print(r)
      # print(f'\ndocument ids to delete\n{delete_object_ids}')
      logger.info(f'{operation}')
      logger.info(f'mode = {mode}')
      query = { "_id" : { "$in" : delete_object_ids } }
      result = collection.find(query)
      logger.info('documents to delete:')
      logger.info(f'{result}')
      logger.info(f'document ids to delete:')
      logger.info(f'{delete_object_ids}')

    ## remove duplicate documents
    if mode == 'delete':
      if len(delete_object_ids) >0:
        requests = [DeleteMany({ "_id": { "$in": delete_object_ids } })]
        result = collection.bulk_write(requests)
        # print(result.bulk_api_result)
        # logger.info(f'{result.bulk_api_result}')
        logger.info(f'Database: inserted_count {result.inserted_count} upserted_count {result.upserted_count} matched_count {result.matched_count} modified_count {result.modified_count} deleted_count {result.deleted_count}')
        return result
      else:
        logger.info('Database: no operations processed - no duplicates matched')

# def merge_to_collection(src_collection, dst_collection, ignore_src_schema_keys = ['_id'], exclude_dst_schema_keys = ['_id'], additonal_dst_schema_keypairs = [], match_src_schema_keys = []):
def merge_to_collection(**kwargs):
    ## check params
    required_args = ['src_collection', 'dst_collection']
    missing_args = [arg for arg in required_args if arg not in kwargs.keys()]
    if len(missing_args) >0:
      print(f'{deduplicate_collection.__name__} missing arguments {missing_args}')
      quit()
    src_collection = kwargs['src_collection']
    dst_collection = kwargs['dst_collection']
    ignore_src_schema_keys = kwargs['ignore_src_schema_keys'] if 'ignore_src_schema_keys' in kwargs.keys() else ['_id']
    exclude_dst_schema_keys = kwargs['exclude_dst_schema_keys'] if 'exclude_dst_schema_keys' in kwargs.keys() else ['_id']
    additonal_dst_schema_keypairs = kwargs['additonal_dst_schema_keypairs'] if 'additonal_dst_schema_keypairs' in kwargs.keys() else []
    match_src_schema_keys = kwargs['match_src_schema_keys'] if 'match_src_schema_keys' in kwargs.keys() else []
    logger_name = kwargs['logger_name'] if 'logger_name' in kwargs.keys() else 'main'
    logger = logging.getLogger(logger_name)

    ## what does this func do?
    #
    # merge new or update existing documents from a source collection to a destination collection
    #   can ommit document keys from the destination document search
    #   can add/remove key pair values to the new/existing document destined for the destination collection
    # caution
    #   the dst collection match can be too broad, returning many documents however the update is targetted to a single document (UpdateOne vs UpdateMany)
    #   there is not much logic here and the function relies on prior deduplication
    #
    # find documents in src_collection matching ANY or match_src_schema_keys if present
    # for each src document remove keys in ignore_src_schema_keys, this will be the match query for the dst_collection (at a minimum this must contain '_id' but may contain more, such as the dynamic 'crypto_map_interface' key)
    # match documents in the dst_collection (this should return 0 or 1 matching documents as the 'device' and 'device['temp']' collections have been deduplicated)
    # merge the dst document with the src document, effectively updating the dst document - if no dst document is present in the dst collection a new dst document is created
    # remove keys from the new dst document listed in exclude_dst_schema_keys (at a minimum this should contain key '_id' to allow the dst collection to write its own index for the document)
    # add additional keys to the new dst document (this is optional, in this case the addition of 'last_modified' date stamp is used to identify current tunnel records)
    # update/upsert the new dst document

    requests = []
    if len(match_src_schema_keys) >0:
      query = {k:{ "$exists": True } for k in match_src_schema_keys}
      src_result = src_collection.find(query)
    else:
      src_result = src_collection.find()
    for r in src_result:
      src_document = r
      filter = {k:v for k, v in r.items() if k not in ignore_src_schema_keys} # example: we dont want crypto_map_interface when matching documents to merge into
      dst_match_count = dst_collection.count_documents(filter)
      if dst_match_count >0:
        dst_result = dst_collection.find(filter)
        for dst_match in dst_result:
          dst_id = dst_match['_id']
          # merge src document fields with dst document, overwrite dst key/value pairs
          dst_document = {**dst_match, **src_document} # z = {**x, **y} - merge dicts, y replaces x
          for exclude in exclude_dst_schema_keys:
            if exclude in dst_document:
              dst_document.pop(exclude)
          if len(additonal_dst_schema_keypairs) >0:
            for kvp in additonal_dst_schema_keypairs:
              dst_document.update(kvp)
          requests.append(UpdateOne({'_id': dst_id}, {'$set': dst_document}, upsert=True))
      else:
        dst_document = src_document
        for exclude in exclude_dst_schema_keys:
          if exclude in dst_document:
            dst_document.pop(exclude)
        if len(additonal_dst_schema_keypairs) >0:
          for kvp in additonal_dst_schema_keypairs:
            dst_document.update(kvp)
        requests.append(InsertOne(dst_document))

    if len(requests) >0:
      result = dst_collection.bulk_write(requests)
      # print(result.bulk_api_result)
      # logger.info(f'{result.bulk_api_result}')
      logger.info(f'Database: inserted_count {result.inserted_count} upserted_count {result.upserted_count} matched_count {result.matched_count} modified_count {result.modified_count} deleted_count {result.deleted_count}')
      return result
    else:
      logger.info('Database: no operations processed - no upsert or insert requests')

## debug function
def diff_collection(src_collection, dst_collection, mode = 'stat', ignore_src_schema_keys = ['_id'], match_src_schema_keys = []):
    print(f'{src_collection.full_name} documents merged into {dst_collection.full_name}')
    # init
    src_doc_count = src_collection.count_documents({})
    dst_doc_match_count = 0
    dst_doc_unmatch_count = 0
    unmatched_documents = []
    # get all documents in the src_collection
    src_result = src_collection.find()
    find_keys = src_result.clone()
    # get all keys in a collection to build query mask
    src_collection_keys = []
    for d in find_keys:
      for key in d.keys():
        src_collection_keys.append(key)
    src_collection_keys = [k for k in list(set(src_collection_keys)) if k not in ignore_src_schema_keys]
    for r in src_result:
      # mangle src_document for use as an exact match query on the dst_collection
      query = {k:v for k, v in r.items() if k not in ignore_src_schema_keys}
      mask = {k:{ "$exists": False } for k in src_collection_keys if k not in query.keys()}
      query.update(mask)
      # search dst_collection for the src_document
      dst_match_count = dst_collection.count_documents(query)
      dst_doc_match_count += dst_match_count # this isnt accurate owing to subset records in the src_collection
      if dst_match_count == 0:
        dst_doc_unmatch_count += 1
        if mode == 'show':
          unmatched_documents.append(r)
    if len(unmatched_documents) >0:
      print('documents did not make it to the dst_collection')
      for d in unmatched_documents:
        missing_keys = [k for k in match_src_schema_keys if k not in d.keys()]
        if len(missing_keys) == 0:
          print('error detected, document contains required keys')
        else:
          print(f'\ndocument missing required keys {missing_keys}')
        print(f'{dumps(d)}')
      print()
    print(f'src_doc_count {src_doc_count}')
    print(f'dst_doc_match_count {dst_doc_match_count}')
    print(f'dst_doc_unmatch_count {dst_doc_unmatch_count}')
    if (dst_doc_match_count + dst_doc_unmatch_count) != src_doc_count:
      print("error detected, set mode = 'show' to highlight rogue documents")
    print('\n')

def spoke_lookup(**kwargs):
    ## check params
    required_args = ['read_device_collection', 'read_ip_collection', 'write_collection']
    missing_args = [arg for arg in required_args if arg not in kwargs.keys()]
    if len(missing_args) >0:
      print(f'{spoke_lookup.__name__} missing arguments {missing_args}')
      quit()
    read_device_collection = kwargs['read_device_collection']
    read_ip_collection = kwargs['read_ip_collection']
    write_collection = kwargs['write_collection']
    logger_name = kwargs['logger_name'] if 'logger_name' in kwargs.keys() else 'main'
    logger = logging.getLogger(logger_name)

    ## init
    # +'peer_vpn_id', +'nhrp_nexthop' = interface lookup using 'nhrp_nexthop' ip - get +'DeviceRecNum', +'DeviceName' (high confidence, nhrp map has an immutable 1:1 mapping)
    # +'peer_vpn_id', -'nhrp_nexthop' = interface lookup using 'peer_vpn_id' ip - get +'DeviceRecNum', +'DeviceName' (fairly high confidence, engineer built tunnel and elected to match the handshake attribute 'peer_vpn_id' with the spoke device address)
    # +'peer_vpn_id', -'nhrp_nexthop' = device lookup using 'peer_vpn_id' name - get +'DeviceRecNum' (fairly high confidence, engineer built tunnel and elected to match the handshake attribute 'peer_vpn_id' with the spoke device name)
    # +'peer_ip', -ips_not_in_other_lists = interface lookup using 'peer_ip' ip - get +'DeviceRecNum', +'DeviceName' (less confidence (catch all), finds half negotiated tunnels and tunnels with public ips thus not likely in mongo(remedy))
    # +'peer_ip', -'DeviceName' = interface lookup using 'peer_ip' - get +'DeviceRecNum', +'DeviceName' (any spoke not matched by previous queries, there is not high confidence in this method thus it must be a dependent(-'DeviceName') and final query)

    ## get spoke attributes by ('nhrp_nexthop' ip) / ('peer_vpn_id' ip) / ('peer_vpn_id' name) for lookup in NMS mongo(remedy) device tickets

    # search nhrp ips
    nhrp_query = {'peer_vpn_id':{ "$exists": True }, 'nhrp_nexthop':{ "$exists": True }}
    nhrp_projection = {"_id": 0, "nhrp_nexthop": 1}
    nhrp_result = write_collection.find(nhrp_query, nhrp_projection)
    nhrp_ips = [ip['nhrp_nexthop'] for ip in nhrp_result]

    # search vpnid ips and names
    vpnid_query = {'peer_vpn_id':{ "$exists": True }, 'nhrp_nexthop':{ "$exists": False }}
    vpnid_projection = {"_id": 0, "peer_vpn_id": 1}
    vpnid_result = write_collection.find(vpnid_query, vpnid_projection)
    vpnid_items = [itm['peer_vpn_id'] for itm in vpnid_result]
    vpnid_ips = [ip for ip in vpnid_items if valid_ipv4(ip)]
    # remove suffix '-ps' from 'peer_vpn_id' for mongo 'DeviceName' lookup (bml0990-ps - bml0990)
    vpnid_names = [re.sub('(?i)-ps$', '', name) for name in vpnid_items if not valid_ipv4(name) and name not in ['none']]

    # search remining peer ips (likely public and not in mongo(remedy))
    all_peerip_ips = [d for d in write_collection.distinct('peer_ip')]
    negate_peerip_ips = list(set(nhrp_ips + vpnid_ips))
    peer_ips = [ip for ip in all_peerip_ips if ip not in negate_peerip_ips]

    ## search NMS mongo(remedy) for interface/device tickets using the spoke attributes

    # search interface collection, match interface ip to 'nhrp_nexthop' ip
    nhrp_ips_query = {"raw.CHR_IPAddress": {"$in": nhrp_ips}}
    nhrp_ips_projection = {"_id": 0, "raw.DeviceRecNum": 1, "raw.CHR_DeviceName": 1, "raw.CHR_IPAddress": 1}
    nhrp_ips_result = read_ip_collection.find(nhrp_ips_query, nhrp_ips_projection)

    # search interface collection, match interface ip to 'peer_vpn_id' ip
    vpnid_ips_query = {"raw.CHR_IPAddress": {"$in": vpnid_ips}}
    vpnid_ips_projection = {"_id": 0, "raw.DeviceRecNum": 1, "raw.CHR_DeviceName": 1, "raw.CHR_IPAddress": 1}
    vpnid_ips_result = read_ip_collection.find(vpnid_ips_query, vpnid_ips_projection)

    # search device collection, match device name to 'peer_vpn_id' name
    vpnid_names_query = {"raw.DeviceName": {"$in": vpnid_names}}
    vpnid_names_projection = {"_id": 0, "raw.DeviceRecNum": 1, "raw.DeviceName": 1}
    vpnid_names_result = read_device_collection.find(vpnid_names_query, vpnid_names_projection)

    # search interface collection, match interface ip to 'peer_ip' ip
    peer_ips_query = {"raw.CHR_IPAddress": {"$in": peer_ips}}
    peer_ips_projection = {"_id": 0, "raw.DeviceRecNum": 1, "raw.CHR_DeviceName": 1, "raw.CHR_IPAddress": 1}
    peer_ips_result = read_ip_collection.find(peer_ips_query, peer_ips_projection)

    ## db requests builder
    def db_requests(list, mode):
      requests = []
      for rec in list:
        if mode == 'nhrp_ip':
          filter = {'nhrp_nexthop': rec['raw']['CHR_IPAddress']}
          record = {'DeviceName': rec['raw']['CHR_DeviceName'], 'DeviceRecNum': rec['raw']['DeviceRecNum']}
        elif mode == 'vpn_ip':
          filter = {'peer_vpn_id': rec['raw']['CHR_IPAddress']}
          record = {'DeviceName': rec['raw']['CHR_DeviceName'], 'DeviceRecNum': rec['raw']['DeviceRecNum']}
        elif mode == 'vpn_id':
          filter = {'peer_vpn_id': re.compile(f'(?i).*({rec["raw"]["DeviceName"]}).*')}
          record = {'DeviceName': rec["raw"]["DeviceName"], 'DeviceRecNum': rec['raw']['DeviceRecNum']}
        elif mode == 'peer_ip':
          filter = {'peer_ip': rec['raw']['CHR_IPAddress']}
          record = {'DeviceName': rec['raw']['CHR_DeviceName'], 'DeviceRecNum': rec['raw']['DeviceRecNum']}
        if not record['DeviceRecNum']:
          devname_query = {"raw.DeviceName": record['DeviceName']}
          devname_projection = {"_id": 0, "raw.DeviceRecNum": 1}
          devname_result = read_device_collection.find_one(devname_query, devname_projection)
          try:
            record['DeviceRecNum'] = devname_result['raw']['DeviceRecNum']
          except:
            pass
        remove_keys = []
        for k, v in record.items():
          if not v:
            remove_keys.append(k)
        for i in remove_keys:
          record.pop(i)
        if len(record) >0:
          requests.append(UpdateMany(filter, {'$set': record}, upsert=False))
      return requests

    nhrp_ips_requests = db_requests(nhrp_ips_result, 'nhrp_ip')
    vpnid_ips_requests = db_requests(vpnid_ips_result, 'vpn_ip')
    vpnid_names_requests = db_requests(vpnid_names_result, 'vpn_id')
    peer_ips_requests = db_requests(peer_ips_result, 'peer_ip')
    requests = nhrp_ips_requests + vpnid_ips_requests + vpnid_names_requests + peer_ips_requests

    # for i in requests:
    #   logger.info(f'Request: {i}')

    ## write nhrp / vpn-id requests
    if len(requests) >0:
      result = write_collection.bulk_write(requests)
      logger.info(f'Database: inserted_count {result.inserted_count} upserted_count {result.upserted_count} matched_count {result.matched_count} modified_count {result.modified_count} deleted_count {result.deleted_count}')

    ## catch all, seldom seen in a run, may catch devices by 'peer_ip' not matched by the previous queries, likely many public ips matched not in mongo(remedy)
    peer_query = {'peer_ip':{ "$exists": True }, 'DeviceName': { "$exists": False}}
    peer_projection = {"_id": 0, "peer_ip": 1}
    peer_result = write_collection.find(peer_query, peer_projection)
    peer_ips = [ip['peer_ip'] for ip in peer_result]
    if len(peer_ips) >0:
      ## search NMS mongo(remedy) for interface collection using the spoke attribute 'peer_ip'
      peer_ips_query = {"raw.CHR_IPAddress": {"$in": peer_ips}}
      peer_ips_projection = {"_id": 0, "raw.DeviceRecNum": 1, "raw.CHR_DeviceName": 1, "raw.CHR_IPAddress": 1}
      peer_ips_result = read_ip_collection.find(peer_ips_query, peer_ips_projection)
      if peer_ips_result.count() >0:
        peer_ips_requests = db_requests(peer_ips_result, 'peer_ip')
        if len(peer_ips_requests) >0:
          result = write_collection.bulk_write(requests)
          logger.info("CatchAll: device ip not matched by 'nhrp_nexthop' / 'peer_vpn_id' and missed by 'peer_ip'")
          logger.info(f'Database: inserted_count {result.inserted_count} upserted_count {result.upserted_count} matched_count {result.matched_count} modified_count {result.modified_count} deleted_count {result.deleted_count}')

def device_ticket_lookup(**kwargs):
    ## check params
    required_args = ['read_device_collection', 'write_collection']
    missing_args = [arg for arg in required_args if arg not in kwargs.keys()]
    if len(missing_args) >0:
      print(f'{device_ticket_lookup.__name__} missing arguments {missing_args}')
      quit()
    read_device_collection = kwargs['read_device_collection']
    write_collection = kwargs['write_collection']
    logger_name = kwargs['logger_name'] if 'logger_name' in kwargs.keys() else 'main'
    logger = logging.getLogger(logger_name)

    ## init
    # this function depends on the 'DeviceRecNum' field populated by the 'spoke_lookup' function

    ## get all 'DeviceRecNum' in device 'temp' collection, lookup the 'DeviceRecNum' in NMS mongo(remedy) to return device attributes
    device_recnums = [d for d in write_collection.distinct('DeviceRecNum')]
    device_hardware_query = {'raw.DeviceRecNum': {'$in': device_recnums}}
    device_hardware_projection = {"_id": 0, 'raw.DeviceRecNum': 1, 'raw.Manufacturer': 1, 'raw.Model': 1}
    device_hardware = read_device_collection.find(device_hardware_query, device_hardware_projection)

    ## update device records in device 'temp' collection
    requests = []
    for rec in device_hardware:
      filter = {'DeviceRecNum': rec['raw']['DeviceRecNum']}
      record = {'Manufacturer': rec['raw']['Manufacturer'], 'Model': rec['raw']['Model']}
      remove_keys = []
      for k, v in record.items():
        if not v:
          remove_keys.append(k)
      for i in remove_keys:
        record.pop(i)
      if len(record) >0:
        requests.append(UpdateMany(filter, {'$set': record}, upsert=False))

    # for i in requests:
    #   logger.info(f'Request: {i}')

    ## write requests
    if len(requests) >0:
      result = write_collection.bulk_write(requests)
      logger.info(f'Database: inserted_count {result.inserted_count} upserted_count {result.upserted_count} matched_count {result.matched_count} modified_count {result.modified_count} deleted_count {result.deleted_count}')

# ## origional deduplicate_collection, reference, finds all distinct document schemas and searches duplicates on exact schema keys and values, the newer version also removes documents with specific subset of keys where a larger document exists (idle tunnels)
# def deduplicate_collection(collection, mode='list', ignore_schema_keys=['_id']):
#     #### dedupe documents with exactly matching schemas
#     ## get all unique document schemas in collection
#     # this pulls all documents in collection, inefficient, with later versions of mongo this can be achieved within the query language
#     # https://stackoverflow.com/questions/2298870/get-names-of-all-keys-in-the-collection
#     delete_object_ids = []
#     document_schemas = []
#     result = collection.find()
#     for i in result:
#       keys = [k for k in i.keys() if k not in ignore_schema_keys]
#       if keys not in document_schemas:
#         document_schemas.append(keys)
#     # print(f'available schemas\n{document_schemas}')

#     ## get all the schema keys in collection, this will be used to mask keys
#     all_schema_keys = list(set(sum(document_schemas, [])))
#     # print(f'all schema keys\n{all_schema_keys}')

#     ## find duplicate documents per schema
#     for schema in document_schemas:
#       ## get all _id schema keys used in aggregate query to match duplicate documents
#       id_keys = {k:f'${k}' for k in schema}
#       include_keys = {k:{ "$exists": True } for k in schema}

#       ## find all keys in all_schema_keys not in this document schema
#       exclude_keys_list = list(set(all_schema_keys) - set(schema))
#       # exclude_keys_list.append('test_key')
#       exclude_keys = {k:{ "$exists": False} for k in exclude_keys_list}

#       ## merge include keys {'$exists': True} + exclude keys{'$exists': False} for the first $match filter to ensure only records with the exact same keys as schema are matched
#       include_keys.update(exclude_keys)
#       # print(f'document match query for this schema\n{include_keys}')

#       ## return the content of duplicate documents
#       # debug
#       # print('\n')
#       # print(f'document schema\n{schema}')
#       # print(f'mask documents with exact keys\n{include_keys}')
#       # print(f'search documents with these keys by matching value (should match schema)\n{id_keys}')
#       duplicates = collection.aggregate([
#         {
#           "$match": include_keys
#         },
#         { "$group": {
#             "_id": id_keys,
#             "count": {"$sum": 1}
#           }
#         },
#         { "$match": {
#             "count": {"$gte": 2}
#           }
#         },
#         { "$sort": {
#             "count": -1
#           }
#         }
#       ])
#       # print(dumps(duplicates, indent=4))

#       ## loop duplicate document content, aggregate search using schema keys mask and get document object_ids for deletion
#       for duplicate_document in duplicates:
#         query = {k:v for k, v in duplicate_document['_id'].items()}
#         # print(query)
#         filtered_result = collection.aggregate([
#           {
#             "$match": include_keys
#           },
#           {
#             "$match": query
#           },
#         ])
#         object_ids = [r['_id'] for r in filtered_result]

#         ## remove the first duplicate document_id, this will be the only remaining document
#         object_ids.pop(0)
#         # print(object_ids)
#         delete_object_ids.extend(object_ids)

#     ## get unique document_ids
#     delete_object_ids = list(set(delete_object_ids))

#     ## list object_ids of duplicate records
#     if mode == 'list':
#       print("mode = 'list'\n")
#       print(f'object_ids to delete\n{delete_object_ids}')

#     ## show duplicate records
#     if mode == 'show':
#       print("mode = 'show'\n")
#       query = { "_id" : { "$in" : delete_object_ids } }
#       result = collection.find(query)
#       print('documents to delete')
#       for r in result:
#         print(r)
#       print(f'\ndocument ids to delete\n{delete_object_ids}')

#     ## remove duplicate documents
#     if mode == 'delete':
#       if len(delete_object_ids) >0:
#         requests = [DeleteMany({ "_id": { "$in": delete_object_ids } })]
#         result = collection.bulk_write(requests)
#         return result


# ## origional merge_to_collection, no schema key match to qualify documents to merge
# def merge_to_collection(src_collection, dst_collection, ignore_schema_keys = ['_id'], exclude_schema_keys = ['_id']):
#   last_modified = datetime.datetime.now(tz=datetime.timezone.utc)
#   requests = []
#   src_result = src_collection.find()
#   for r in src_result:
#     src_document = r
#     filter = {k:v for k, v in r.items() if k not in ignore_schema_keys} # dont want crypto_map_interface
#     dst_match_count = dst_collection.count_documents(filter)
#     if dst_match_count >0:
#       dst_result = dst_collection.find(filter)
#       for dst_match in dst_result:
#         dst_id = dst_match['_id']
#         # merge src document fields with dst document, overwrite dst key/value pairs
#         dst_document = {**dst_match, **src_document} # z = {**x, **y} y replaces x
#         for exclude in exclude_schema_keys:
#           if exclude in dst_document:
#             dst_document.pop(exclude)
#         dst_document.update({'last_modified': last_modified})
#         requests.append(UpdateOne({'_id': dst_id}, {'$set': dst_document}, upsert=True))
#     else:
#       dst_document = src_document
#       for exclude in exclude_schema_keys:
#         if exclude in dst_document:
#           dst_document.pop(exclude)
#       dst_document.update({'last_modified': last_modified})
#       requests.append(InsertOne(dst_document))

#   if len(requests) >0:
#     dst_result = dst_collection.bulk_write(requests)
#     print(dst_result.bulk_api_result)
#     return dst_result