# irods_testbench_training_py.../covid_csv_upload/phe_upload.py
#
# 383 lines
# 17 KiB
# Python
# Executable File

#import os
import argparse
import pathlib
import toml
from datetime import datetime
from irods.session import iRODSSession
from irods.column import Criterion
from irods.models import DataObject, Collection
# Command line interface: a single optional switch that suppresses the
# interactive "do you want to continue?" prompt shown after failed validation.
parser = argparse.ArgumentParser(prog=__file__)
parser.add_argument(
    "--confirm",
    action="store_true",
    help="auto continue script when a file entry fails validation",
)
args = parser.parse_args()
# Files read/written by this script; both are relative paths.
configFile = 'config.toml'
fileList = 'upload_files.toml'
# Per-run log file named "<script name>.<ISO timestamp without fraction>.log".
# Path.stem removes only the final suffix; the previous str.strip('.py')
# removed any leading/trailing '.', 'p' and 'y' characters, which mangled
# script names such as "phe_upload.py" into "he_upload".
# ':' is replaced by '.' across the whole name.
logFile = (
    pathlib.Path(__file__).stem
    + '.'
    + datetime.now().isoformat().split(".")[0]
    + '.log'
).replace(':', '.')
# Template written to the config file when none exists (see findConfig).
# The Windows path backslash is escaped ('\\P') so the Python literal does not
# contain the invalid escape sequence '\P'; the resulting text is unchanged.
# The trailing backslashes after the opening quotes and after the last entry
# are line continuations, so the text has no leading or trailing newline.
exampleConfig = """\
# required connect
[connect]
iRODS_host = '192.168.150.56'
iRODS_port = '1247'
iRODS_user = 'rods'
iRODS_password = 'password'
iRODS_zone = 'OCF'
# required files
[files]
# required at least one entry
[files.irods]
path = 'C:\\PHE_iRODS'
extension = [".md", ".exe"]
recursive = true
collection = '/OCF/home/rods/test'
# required for windows_create_date
[files.irods.metadata]
windows_create_date = true
# optional additional metadata, U not required
[files.irods.metadata.company]
A = 'company'
V = 'OCF'
U = 'string'
[files.irods.metadata.department]
A = 'department'
V = 'Cloud'\
"""
def findConfig(exampleConfig, configFile):
    """Load the TOML configuration, or write a template and stop if it is missing.

    Parameters:
        exampleConfig: text written to ``configFile`` when that file does
            not exist yet.
        configFile: path of the TOML configuration file.

    Returns:
        The object returned by ``toml.load(configFile)``.

    Raises:
        SystemExit: without a message after the template has been written,
            or with an 'invalid TOML format' message when loading fails.
    """
    if not pathlib.Path(configFile).exists():
        with open(configFile, 'w') as config:
            config.write(exampleConfig)
        print('config missing, wrote example to ' + configFile + ' please modify this config and re-run')
        raise SystemExit()
    try:
        return toml.load(configFile)
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C still aborts; the parser's
        # own message is appended so the user can locate the problem.
        raise SystemExit('invalid TOML format: ' + configFile + ' (' + str(err) + ')')
def _hasKeys(table, keys):
    """Return True when every name in ``keys`` can be looked up in ``table``."""
    try:
        for key in keys:
            table[key]
    except (LookupError, TypeError):
        # LookupError: the key is absent; TypeError: table is not subscriptable
        # by name at all (for example a string or number in the config).
        return False
    return True


def parseConfig(config):
    """Validate the structure of a loaded configuration.

    Checks, in order: the 'connect' and 'files' sections exist and are not
    empty; 'connect' holds all five iRODS_* settings; every entry below
    'files' has path/extension/recursive/collection, with 'extension' a list
    and 'recursive' a bool; every entry has a non-empty 'metadata' table with
    a boolean 'windows_create_date'; every other metadata item has 'A' and
    'V'.

    Parameters:
        config: the configuration as loaded from the TOML config file.

    Returns:
        None when the configuration passes every check.

    Raises:
        SystemExit: with a message naming the first problem found.
    """
    reqEntry = ('connect', 'files')
    # check missing entries
    for section in reqEntry:
        if not _hasKeys(config, [section]):
            raise SystemExit('missing config entry: ' + section)
    # check empty entries
    for section in reqEntry:
        if not config[section]:
            raise SystemExit('empty config entry: ' + section)
    # check connect entry
    connectKeys = ('iRODS_host', 'iRODS_port', 'iRODS_user', 'iRODS_password', 'iRODS_zone')
    if not _hasKeys(config['connect'], connectKeys):
        raise SystemExit('missing config entry: connect')
    # check files entries
    for name in config['files']:
        # check search and put entries
        if not _hasKeys(config['files'], [name]) or not _hasKeys(
                config['files'][name], ('path', 'extension', 'recursive', 'collection')):
            raise SystemExit('missing config entry in: files.' + name)
        rule = config['files'][name]
        # check extension is list
        if not isinstance(rule['extension'], list):
            raise SystemExit('extension is not a list entry in: files.' + name + '.extension')
        # check if recursive is bool
        if not isinstance(rule['recursive'], bool):
            raise SystemExit('recursive is not a boolean entry in: files.' + name + '.recursive')
        # check metadata windows_create_date
        if not _hasKeys(rule, ['metadata']):
            raise SystemExit('missing config entry in: files.' + name + '.metadata')
        metadata = rule['metadata']
        if not metadata:
            raise SystemExit('empty metadata entry in: files.' + name + '.metadata')
        if not _hasKeys(metadata, ['windows_create_date']):
            raise SystemExit('missing config entry: files.' + name + '.metadata.windows_create_date')
        if not isinstance(metadata['windows_create_date'], bool):
            raise SystemExit('windows_create_date is not a boolean entry in metadata')
        # check metadata contain requisite fields (unit 'U' is optional)
        for tag in metadata:
            if tag == 'windows_create_date':
                continue
            if not _hasKeys(metadata[tag], ('A', 'V')):
                raise SystemExit('missing config entry in: files.' + name + '.metadata.' + tag)
def findFiles(dir, ext, recursive):
    """Return absolute paths (as strings) of files in ``dir`` matching ``ext``.

    Parameters:
        dir: directory to scan.
        ext: collection of suffixes (including the dot, e.g. '.md'); a file
            matches when its ``Path.suffix`` is contained in it.
        recursive: when true, sub-directories are scanned as well.

    Returns:
        A list of path strings in directory iteration order.
    """
    matches = []
    for entry in pathlib.Path(dir).iterdir():
        if recursive and entry.is_dir():
            # descend and splice the sub-directory's matches in place
            matches.extend(findFiles(entry.absolute(), ext, recursive))
            continue
        if entry.is_file() and entry.suffix in ext:
            matches.append(str(entry.absolute()))
    return matches
def _createDateMeta(path):
    """Return [attribute, value] pairs derived from ``path``'s ``st_ctime``.

    NOTE(review): ``st_ctime`` is the creation time on Windows only; on Unix
    it is the time of the last metadata change (see the os.stat documentation).
    """
    ctime = pathlib.Path(path).stat().st_ctime
    stamp = datetime.fromtimestamp(ctime)
    return [
        ['date', str(stamp.date())],
        ['time', str(stamp.time()).split(".")[0]],
        ['date_epoc', str(ctime).split(".")[0]],
        ['year', format(stamp.year, '04')],
        ['month', format(stamp.month, '02')],
        ['day', format(stamp.day, '02')],
    ]


def createFileList(config, fileList):
    """Create the editable TOML upload list unless it already exists.

    For every rule below ``config['files']`` the matching local files are
    collected with ``findFiles`` and written as numbered records of the form
    ``{'file': ..., 'collection': ..., 'metadata': [[A, V, U?], ...]}``.

    Fixes compared with the previous implementation:
      * files are collected per rule, so files found for one rule are no
        longer re-recorded under the collection/metadata of later rules;
      * the list file is only created after every rule has been processed,
        so an invalid path no longer leaves an empty list file behind that
        the next run would treat as a finished list.

    Parameters:
        config: validated configuration (see parseConfig).
        fileList: path of the TOML upload list.

    Returns:
        1 when ``fileList`` already exists, 0 when it was created.

    Raises:
        SystemExit: when a rule's ``path`` does not exist.
    """
    if pathlib.Path(fileList).exists():
        print('\nfile list exists: ' + fileList + '\n')
        return 1

    output = {}
    record = 0
    for name, rule in config['files'].items():
        filesPath = rule['path']
        filesCollection = rule['collection']
        # check path exists, may want to check collection exists
        if not pathlib.Path(filesPath).exists():
            raise SystemExit('invalid path entry in: files.' + name + '.path')
        # static metadata shared by every file of this rule; unit 'U' is optional
        metas = []
        for key, tag in rule['metadata'].items():
            if key == 'windows_create_date':
                continue
            meta = [tag['A'], tag['V']]
            if 'U' in tag:
                meta.append(tag['U'])
            metas.append(meta)
        addCreateDate = rule['metadata']['windows_create_date']
        # one numbered record per file that matches this rule
        for found in findFiles(filesPath, rule['extension'], rule['recursive']):
            record += 1
            fileMeta = metas + _createDateMeta(found) if addCreateDate else list(metas)
            output[str(record)] = {'file': found, 'collection': filesCollection, 'metadata': fileMeta}

    # write file list as toml to easily be edited manually
    with open(fileList, 'w') as file_object:
        file_object.write(toml.dumps(output))
    print('\nfile list did not exist, created ' + str(record) + ' entries: ' + fileList + '\n' + '\ncheck content, add/remove or use as a template for your own file list\n' + '\nrerun this script to continue\n')
    return 0
# may want continue confirmation here
def writeLog(message, entry):
    """Append one section to the run log.

    The log is opened in append mode: this function is called once per
    report section, and opening with 'w' truncated the file on every call so
    only the last section survived.

    Parameters:
        message: heading text written before the entries.
        entry: mapping serialised with ``toml.dumps`` after the heading.
    """
    with open(logFile, 'a') as log:
        log.write(message + toml.dumps(entry))
def getConnect(config):
    """Build an iRODS session from the [connect] section of the configuration.

    Parameters:
        config: configuration holding the five iRODS_* values under 'connect'.

    Returns:
        An ``iRODSSession`` whose ``connection_timeout`` has been set to 300.
    """
    settings = config['connect']
    session = iRODSSession(
        host=settings['iRODS_host'],
        port=settings['iRODS_port'],
        user=settings['iRODS_user'],
        password=settings['iRODS_password'],
        zone=settings['iRODS_zone'],
    )
    session.connection_timeout = 300
    return session
def uploadFiles(fileContent, config):
    """Upload every listed file to iRODS and attach its metadata.

    Each record is uploaded to ``<collection>/<local file name>``; its
    metadata items ``[A, V]`` or ``[A, V, U]`` are then added to the new data
    object. A record counts as failed when the upload or any metadata call
    raises, so a failed record may already exist in iRODS without its full
    metadata. Successful and failed records are written to the log via
    ``writeLog``.

    Parameters:
        fileContent: mapping of record id to
            ``{'file': ..., 'collection': ..., 'metadata': [...]}``.
        config: configuration passed on to ``getConnect``.
    """
    logSuccessUpload = {}
    logFailedUpload = {}
    with getConnect(config) as session:
        for key, entry in fileContent.items():
            filePath = entry['file']
            objPath = entry['collection'] + '/' + pathlib.Path(filePath).name
            metaData = entry['metadata']
            try:
                session.data_objects.put(filePath, objPath)  # upload
                obj = session.data_objects.get(objPath)
                for item in metaData:
                    if len(item) > 2:
                        obj.metadata.add(item[0], item[1], item[2])
                    else:
                        obj.metadata.add(item[0], item[1])
            except Exception as err:
                # Exception rather than a bare except: Ctrl-C must abort the
                # run instead of being recorded as a failed upload. The cause
                # is printed so failures can be diagnosed.
                logFailedUpload[key] = entry
                print('\nfailed to upload file to iRODS :' + objPath + ' (' + repr(err) + ')')
            else:
                logSuccessUpload[key] = entry
                print('\nuploaded file to iRODS :' + objPath)
    if logSuccessUpload:
        writeLog('\n#### iRODS successful upload / metadata tag ####\n\n', logSuccessUpload)
    if logFailedUpload:
        writeLog('\n#### iRODS failed upload / metadata tag ####\n\n', logFailedUpload)
def _remoteProblem(session, collection, objName):
    """Return the iRODS-side reason a record cannot be uploaded, or None."""
    colQuery = session.query(Collection).filter(Criterion('=', Collection.name, collection))
    if not [row[Collection.name] for row in colQuery]:
        return 'missing iRODS collection'
    objQuery = (
        session.query(DataObject)
        .filter(Criterion('=', Collection.name, collection))
        .filter(Criterion('=', DataObject.name, objName))
    )
    if [row[DataObject.name] for row in objQuery]:
        return 'existing iRODS object'
    return None


def _localProblem(filePath):
    """Return the local reason a record cannot be uploaded, or None."""
    localFile = pathlib.Path(filePath)
    if not localFile.parent.exists():
        return 'missing local path'
    if not localFile.exists():
        return 'missing file in local path'
    return None


def prepUploadFiles(fileList, config):
    """Validate the upload list and return only the records that can be uploaded.

    Every record is checked against iRODS (the target collection must exist
    and must not already contain an object with the file's name) and against
    the local file system (the parent directory and the file must exist).
    Records with a problem are reported on the console, written to the log
    via ``writeLog`` grouped by problem, and removed from the result. Unless
    ``--confirm`` was given the user is asked whether to continue.

    The iRODS session is only open while the records are being checked, not
    while waiting for the user's answer.

    Parameters:
        fileList: path of the TOML upload list.
        config: configuration passed on to ``getConnect``.

    Returns:
        The loaded upload list without the records that failed validation.

    Raises:
        SystemExit: when the user does not answer 'y' or 'yes'.
    """
    # report order: combined problems first, then single problems
    reportOrder = (
        'missing iRODS collection AND missing local path',
        'missing iRODS collection AND missing file in local path',
        'existing iRODS object AND missing local path',
        'existing iRODS object AND missing file in local path',
        'missing iRODS collection',
        'existing iRODS object',
        'missing local path',
        'missing file in local path',
    )
    fileContent = toml.load(fileList)

    # group failing records by their (possibly combined) problem description
    rejected = {}
    with getConnect(config) as session:
        for key, entry in fileContent.items():
            filePath = entry['file']
            objName = pathlib.Path(filePath).name
            problems = (
                _remoteProblem(session, entry['collection'], objName),
                _localProblem(filePath),
            )
            label = ' AND '.join(problem for problem in problems if problem)
            if label:
                rejected.setdefault(label, {})[key] = entry

    for label in reportOrder:
        if label not in rejected:
            continue
        print('\n' + label + ', check log\n')
        # log headings are the console label with its first letter capitalised
        writeLog('\n#### ' + label[:1].upper() + label[1:] + ' ####\n\n', rejected[label])

    # continue
    if rejected:
        print('\nfailed validation for entry in ' + fileList + ', the entry will be skipped, check log ' + logFile + '\n')
        if not args.confirm:
            print('\n(to avoid this confirmation run the script with the argument \'--confirm\')\n')
            answer = input("\ndo you want to continue? y/n ").strip().lower()
            if answer not in ('y', 'yes'):
                raise SystemExit()

    # drop every record that failed validation
    for entries in rejected.values():
        for key in entries:
            fileContent.pop(key)
    return fileContent
def main():
    """Run the workflow: load and validate the config, then build or process the file list.

    When ``createFileList`` reports that it has just created the list
    (falsy return) the run stops there; otherwise the existing list is
    validated and the remaining records are uploaded.
    """
    settings = findConfig(exampleConfig, configFile)
    parseConfig(settings)
    if not createFileList(settings, fileList):
        return
    uploadFiles(prepUploadFiles(fileList, settings), settings)


if __name__ == "__main__":
    main()