# Replicate the environment

Python script to replicate grep -r.

## Python venv

```sh
python3 -m venv --prompt part2 ~/part2/.venv
source ~/part2/.venv/bin/activate
python --version # Python 3.10.6
pip install --upgrade pip
```

## enter/exit venv

```sh
source $HOME/part2/.venv/bin/activate
deactivate
```

## Install python dependencies

```sh
nano -cw pip_requirements.txt

binaryornot==0.4.4
chardet==5.0.0

python -m pip install -r pip_requirements.txt
```

# Foreword

There are no classes, threads or data structures in these scripts; those would have taken me much longer as a non-native coder. It's all functions, simple lists, de facto libraries and raw print to terminal, so it shouldn't be painfully slow.

# 1st attempt script

- This script explored the loop mechanism and parsing the RE from the CLI; it did not handle errors well and the results against grep were not very consistent.

```
python test1.py '.*pass.*' '/home/openstack/stack /home/openstack/.ssh /home/openstack/openstack_prep.sh' | wc -l
29

grep -r '.*pass.*' /home/openstack/stack /home/openstack/.ssh /home/openstack/openstack_prep.sh | wc -l
grep: /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/local/2.2.3/linux_amd64/terraform-provider-local_v2.2.3_x5: binary file matches
grep: /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/null/3.2.1/linux_amd64/terraform-provider-null_v3.2.1_x5: binary file matches
grep: /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/time/0.9.1/linux_amd64/terraform-provider-time_v0.9.1_x5: binary file matches
grep: /home/openstack/stack/.terraform/providers/registry.terraform.io/terraform-provider-openstack/openstack/1.48.0/linux_amd64/terraform-provider-openstack_v1.48.0: binary file matches
29
```

Script was named `test1.py`

```python
import argparse
import pathlib
import re
from binaryornot.check import is_binary

# behavioural vars
filesRecursive = 1
hideBin = 1

# compiled search regex, set by main() from the command line
pattern = None


def validInput(path):
    """Return the entries of the list `path` that exist on disk.

    Each missing entry is reported on stdout and left out of the result.
    """
    paths = []
    for item in path:
        if not pathlib.Path(item).exists():
            print(' %s: No such file or directory' % (item))
        else:
            paths.append(item)
    return paths


def findFiles(path, recursive):
    """Return the absolute paths (as strings) of the files found at `path`.

    A file yields itself; a directory yields the files directly inside it
    and, when `recursive` is truthy, those of its sub-directories as well.
    """
    target = pathlib.Path(path)
    files = []
    if target.is_file():
        files.append(str(target.absolute()))
    elif target.is_dir():
        for item in target.iterdir():
            if item.is_dir() and recursive:
                files.extend(findFiles(item.absolute(), recursive))
            elif item.is_file():
                files.append(str(item.absolute()))
    return files


def findPattern(files):
    """Print every line that matches the global `pattern` as '<file>: <line>'.

    Binary files are skipped; they are only mentioned when `hideBin` is off.
    """
    for file in files:
        if not is_binary(file):
            # `with` closes the handle; a bare open() leaks one handle per file
            with open(file) as handle:
                for line in handle:
                    if pattern.search(line) is not None:
                        print('%s: %s' % (file, line.strip()))
        elif not hideBin:
            print('%s: binary file excluded' % (file))


def main():
    """Parse the command line, collect the files to search, then search them."""
    global pattern
    parser = argparse.ArgumentParser()
    parser.add_argument("pattern", help="[Regex]")
    parser.add_argument("files", help="[File File ...]")
    args = parser.parse_args()  # parse once and reuse the result
    pattern = re.compile(args.pattern)
    # str.split() ignores leading/trailing whitespace; splitting on r'\s+'
    # would yield '' there, which pathlib treats as the current directory
    pathList = args.files.split()

    fileList = []
    for item in validInput(pathList):
        fileList.extend(findFiles(item, filesRecursive))
    findPattern(fileList)


if __name__ == "__main__":
    main()
```

# 2nd attempt script

- This script was intended to handle missing paths, permission issues and odd files such as binaries gracefully.
- The recursive looping mechanism was changed to try to emulate the order of grep's results output.
- To try to keep parity between the grep and pathlib `wc -l` line counts, errors are written to stderr.
- I guess the RE engine grep uses is different from Python's standard `re` module, so this script will likely show differences as more complex queries are built. For reasons of time and complexity this was not explored. Grep handles greedy matches much more gracefully.
Script was named `test2.py`

```python
import argparse
import sys
import pathlib
import re

# behavioural vars
filesRecursive = 1

# shared state: the compiled search regex (set by main) and the files queued
# for searching (filled by findFiles)
pattern = None
fileList = []


def findDirs(path, recursive):
    """Queue the files inside directory `path`.

    Files directly inside `path` are handed to findFiles; sub-directories
    are walked as well when `recursive` is truthy. An error raised while
    handling an entry is written to stderr, prefixed with `path`, and the
    walk continues with the next entry. A failure to list `path` itself is
    not caught here and propagates to the caller.
    """
    for item in pathlib.Path(path).iterdir():
        try:
            if item.is_file():
                findFiles(str(item.absolute()), recursive)
        except Exception as f:
            sys.stderr.write('%s: %s \n' % (path, f))
        try:
            if item.is_dir() and recursive:
                findDirs(item, recursive)
        except Exception as d:
            sys.stderr.write('%s: %s \n' % (path, d))


def findFiles(path, recursive):
    """Append `path` to `fileList` if it is a file, or walk it if it is a directory.

    A path that does not exist is reported on stderr. Errors raised while
    inspecting or walking `path` are reported on stderr as well.
    """
    target = pathlib.Path(path)
    if not target.exists():
        sys.stderr.write('%s: No such file or directory \n' % (path))
        return
    try:
        if target.is_file():
            fileList.append(str(target.absolute()))
    except Exception as f:
        sys.stderr.write('%s: %s \n' % (path, f))
    try:
        if target.is_dir():
            findDirs(path, recursive)
    except Exception as d:
        sys.stderr.write('%s: %s \n' % (path, d))


def findPattern(files):
    """Print every line that matches the global `pattern` as '<file>: <line>'.

    A file that cannot be opened or decoded is reported on stderr and the
    search moves on to the next file.
    """
    for file in files:
        try:
            with open(file) as f:
                for line in f:
                    if pattern.search(line) is not None:
                        # generally this would be the place to change messaging to match
                        # grep format and add indicators such as line numbers
                        print('%s: %s' % (file, line.strip()))
        except Exception as h:
            # PermissionError and decode errors are reported the same way
            sys.stderr.write('%s: %s \n' % (file, h))


def main():
    """Parse the command line, queue the files to search, then search them."""
    global pattern
    parser = argparse.ArgumentParser()
    parser.add_argument("pattern", help="[Regex]")
    parser.add_argument("files", help="[File File ...]")
    args = parser.parse_args()  # parse once and reuse the result
    try:
        pattern = re.compile(args.pattern)
    except re.error as e:
        # report a bad regex as a usage error instead of a traceback
        parser.error('invalid regex: %s' % e)
    # str.split() ignores leading/trailing whitespace; splitting on r'\s+'
    # would yield '' there, which pathlib treats as the current directory
    for item in args.files.split():
        findFiles(item, filesRecursive)
    findPattern(fileList)


if __name__ == "__main__":
    main()
```

## Results

Matching comparisons.
```sh ## 1 python test2.py '.*pass.*' '/home/openstack/stack /home/openstack/.ssh /home/openstack/openstack_prep.sh' | wc -l /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/local/2.2.3/linux_amd64/terraform-provider-local_v2.2.3_x5: 'utf-8' codec can't decode byte 0xc0 in position 24: invalid start byte /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/null/3.2.1/linux_amd64/terraform-provider-null_v3.2.1_x5: 'utf-8' codec can't decode byte 0xe0 in position 24: invalid continuation byte /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/time/0.9.1/linux_amd64/terraform-provider-time_v0.9.1_x5: 'utf-8' codec can't decode byte 0xe0 in position 24: invalid continuation byte /home/openstack/stack/.terraform/providers/registry.terraform.io/terraform-provider-openstack/openstack/1.48.0/linux_amd64/terraform-provider-openstack_v1.48.0: 'utf-8' codec can't decode byte 0xc8 in position 40: invalid continuation byte 29 grep -r '.*pass.*' /home/openstack/stack /home/openstack/.ssh /home/openstack/openstack_prep.sh | wc -l grep: /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/local/2.2.3/linux_amd64/terraform-provider-local_v2.2.3_x5: binary file matches grep: /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/null/3.2.1/linux_amd64/terraform-provider-null_v3.2.1_x5: binary file matches grep: /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/time/0.9.1/linux_amd64/terraform-provider-time_v0.9.1_x5: binary file matches grep: /home/openstack/stack/.terraform/providers/registry.terraform.io/terraform-provider-openstack/openstack/1.48.0/linux_amd64/terraform-provider-openstack_v1.48.0: binary file matches 29 ## 2 python test2.py '^.*auth.*"Password0"$' '/home/openstack/stack /home/.ssh' | wc -l /home/.ssh: No such file or directory 
/home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/local/2.2.3/linux_amd64/terraform-provider-local_v2.2.3_x5: 'utf-8' codec can't decode byte 0xc0 in position 24: invalid start byte /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/null/3.2.1/linux_amd64/terraform-provider-null_v3.2.1_x5: 'utf-8' codec can't decode byte 0xe0 in position 24: invalid continuation byte /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/time/0.9.1/linux_amd64/terraform-provider-time_v0.9.1_x5: 'utf-8' codec can't decode byte 0xe0 in position 24: invalid continuation byte /home/openstack/stack/.terraform/providers/registry.terraform.io/terraform-provider-openstack/openstack/1.48.0/linux_amd64/terraform-provider-openstack_v1.48.0: 'utf-8' codec can't decode byte 0xc8 in position 40: invalid continuation byte 2 grep -r '^.*auth.*"Password0"$' /home/openstack/stack /home/.ssh | wc -l grep: /home/.ssh: No such file or directory 2 ## 3 python test2.py Password. /home/openstack/stack | wc -l /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/local/2.2.3/linux_amd64/terraform-provider-local_v2.2.3_x5: 'utf-8' codec can't decode byte 0xc0 in position 24: invalid start byte /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/null/3.2.1/linux_amd64/terraform-provider-null_v3.2.1_x5: 'utf-8' codec can't decode byte 0xe0 in position 24: invalid continuation byte /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/time/0.9.1/linux_amd64/terraform-provider-time_v0.9.1_x5: 'utf-8' codec can't decode byte 0xe0 in position 24: invalid continuation byte /home/openstack/stack/.terraform/providers/registry.terraform.io/terraform-provider-openstack/openstack/1.48.0/linux_amd64/terraform-provider-openstack_v1.48.0: 'utf-8' codec can't decode byte 0xc8 in position 40: invalid continuation byte 9 grep -r Password. 
/home/openstack/stack | wc -l grep: /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/local/2.2.3/linux_amd64/terraform-provider-local_v2.2.3_x5: binary file matches grep: /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/null/3.2.1/linux_amd64/terraform-provider-null_v3.2.1_x5: binary file matches grep: /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/time/0.9.1/linux_amd64/terraform-provider-time_v0.9.1_x5: binary file matches grep: /home/openstack/stack/.terraform/providers/registry.terraform.io/terraform-provider-openstack/openstack/1.48.0/linux_amd64/terraform-provider-openstack_v1.48.0: binary file matches 9 ## 4 python test2.py '^.*word' /home/openstack/stack | wc -l /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/local/2.2.3/linux_amd64/terraform-provider-local_v2.2.3_x5: 'utf-8' codec can't decode byte 0xc0 in position 24: invalid start byte /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/null/3.2.1/linux_amd64/terraform-provider-null_v3.2.1_x5: 'utf-8' codec can't decode byte 0xe0 in position 24: invalid continuation byte /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/time/0.9.1/linux_amd64/terraform-provider-time_v0.9.1_x5: 'utf-8' codec can't decode byte 0xe0 in position 24: invalid continuation byte /home/openstack/stack/.terraform/providers/registry.terraform.io/terraform-provider-openstack/openstack/1.48.0/linux_amd64/terraform-provider-openstack_v1.48.0: 'utf-8' codec can't decode byte 0xc8 in position 40: invalid continuation byte 17 grep -r '^.*word' /home/openstack/stack | wc -l grep: /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/local/2.2.3/linux_amd64/terraform-provider-local_v2.2.3_x5: binary file matches grep: /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/null/3.2.1/linux_amd64/terraform-provider-null_v3.2.1_x5: binary file 
matches
grep: /home/openstack/stack/.terraform/providers/registry.terraform.io/hashicorp/time/0.9.1/linux_amd64/terraform-provider-time_v0.9.1_x5: binary file matches
grep: /home/openstack/stack/.terraform/providers/registry.terraform.io/terraform-provider-openstack/openstack/1.48.0/linux_amd64/terraform-provider-openstack_v1.48.0: binary file matches
17
```

- The parity between the grep and pathlib results ends here; the script starts to unravel, requiring much more logic to emulate grep.
- Grep can traverse some binary files that pathlib cannot, whilst pathlib lists individual files as permission denied where grep does not traverse the containing directory at all, knowing that it cannot traverse the children.
- There are many behavioural differences that would each need to be isolated and then handled with extra logic to fully emulate grep's results.

```sh
## 5
python test2.py '^.*a' /etc | wc -l
/etc/ssl: [Errno 13] Permission denied: '/etc/ssl/private'
...
/etc/ssh/ssh_host_ecdsa_key: [Errno 13] Permission denied: '/etc/ssh/ssh_host_ecdsa_key'
37248

grep -r '^.*a' /etc | wc -l
grep: /etc/gshadow: Permission denied
...
grep: /etc/ssh/ssh_host_ecdsa_key: Permission denied
26294
```