diff --git a/credentials.json b/credentials.json new file mode 100644 index 0000000..9029638 --- /dev/null +++ b/credentials.json @@ -0,0 +1,6 @@ +{ + "stock": [["miner", "miner"], ["root", "root"]], + "braiins": [["root", "root"]], + "luxos": [["root", "root"]], + // "vnish": [["user", "password"]] // apparently vnish's ssh port is closed? I messaged them on telegram asking about it. +} diff --git a/errors.json b/errors.json new file mode 100644 index 0000000..92f3ade --- /dev/null +++ b/errors.json @@ -0,0 +1,14 @@ +{ + "error_keywords": [ + "ERROR_TEMP_TOO_HIGH", + "ERROR_HASHRATE_TOO_LOW", + "ERROR_NETWORK_DISCONNECTED", + "ERROR_POWER_LOST: power voltage rise or drop", + "SWEEP_STRING", + "_pic_write_iic failed!", + "PLL read exceeded wait time", + "ERROR_SOC_INIT: soc init failed", + "fail to read 0:1", + "fail to write 0:1" + ] +} diff --git a/finder.py b/finder.py index 4260020..592d3e4 100644 --- a/finder.py +++ b/finder.py @@ -1,31 +1,29 @@ import paramiko import csv import re +import json -# Define credentials for different OS types -credentials = { - "stock": [("miner", "miner"), ("root", "root")], - "braiins": [("root", "root")], - "luxos": [("root", "root")], - "vnish": [("user", "password")] # Replace with correct credentials -} +# Load credentials from a JSON file +def load_credentials(file_path): + with open(file_path, 'r') as file: + return json.load(file) -# List of error keywords -error_keywords = [ - "ERROR_TEMP_TOO_HIGH", - "ERROR_HASHRATE_TOO_LOW", - "ERROR_NETWORK_DISCONNECTED", - "ERROR_POWER_LOST: power voltage rise or drop", - "SWEEP_STRING", - "_pic_write_iic failed!", - "PLL read exceeded wait time", - "ERROR_SOC_INIT: soc init failed", - "fail to read 0:1", - "fail to write 0:1" -] +# Load error keywords from a JSON file +def load_error_keywords(file_path): + with open(file_path, 'r') as file: + return json.load(file)['error_keywords'] -# Regex pattern for ASIC chip errors +# Define paths to the configuration files +CREDENTIALS_FILE = 'credentials.json' +ERRORS_FILE = 'errors.json' + +# Load credentials and error keywords +credentials = load_credentials(CREDENTIALS_FILE) +error_keywords = load_error_keywords(ERRORS_FILE) + +# Regex patterns for ASIC chip errors and power-off messages asic_pattern = re.compile(r"Chain\[(\d+)\]: find (\d+) asic, times \d+") +power_off_pattern = re.compile(r"Chain (\d+) only find (\d+) asic, will power off hash board (\d+)") # Function to read IP addresses from a file def read_ips(file_path): @@ -34,16 +32,17 @@ def read_ips(file_path): return [ip.strip() for ip in ips] # Function to check log files for keywords and ASIC errors -def check_logs(ip, ssh_client): +def check_logs(ip, ssh_client, worker_id): logs = [] - asic_errors = {} - correct_asic_count = {} + asic_errors = set() # Using set to avoid duplicate errors + results = [] try: print(f"Checking logs on {ip}") stdin, stdout, stderr = ssh_client.exec_command("find /var/log/ -type f") log_files = stdout.readlines() for log_file in log_files: log_file = log_file.strip() + print(f"Checking file: {log_file}") # Debug statement # Check if file should be ignored if log_file.endswith(('tmp', 'utmp', 'btmp', 'wtmp')): continue @@ -56,24 +55,29 @@ def check_logs(ip, ssh_client): stdin, stdout, stderr = ssh_client.exec_command(f"cat {log_file}") log_content = stdout.read().decode('utf-8', errors='ignore') + print(f"Content of {log_file}: {log_content[:500]}") # Debug statement to show part of the log content for keyword in error_keywords: if keyword in log_content: logs.append((log_file, keyword)) - # Check for ASIC chip errors + # Check for ASIC chip errors and power-off messages for match in asic_pattern.finditer(log_content): chain, asic_count = match.groups() asic_count = int(asic_count) - if chain not in asic_errors: - asic_errors[chain] = [] - asic_errors[chain].append(asic_count) + asic_errors.add((chain, asic_count)) + print(f"Chain {chain} has {asic_count} chips.") # Debug statement - # Determine the correct ASIC count for each chain - if chain not in correct_asic_count and asic_count > 0: - correct_asic_count[chain] = asic_count + # Check for power-off messages + for match in power_off_pattern.finditer(log_content): + chain, found_asic_count, board = match.groups() + found_asic_count = int(found_asic_count) + chain = int(chain) + print(f"Power-off message found: Chain {chain}, ASIC count: {found_asic_count}, Board: {board}") # Debug statement + results.append([worker_id, ip, "ASIC Error", f"Chain {chain} has failed with {found_asic_count} ASICs found and will power off hash board {board}"]) + except Exception as e: print(f"Error checking logs on {ip}: {e}") - return logs, asic_errors, correct_asic_count + return logs, asic_errors, results # Function to get worker ID def get_worker_id(ssh_client): @@ -85,7 +89,6 @@ def get_worker_id(ssh_client): match = re.search(r'"user" *: *"[^.]*\.(\w+)"', config_content) if match: worker_id = match.group(1) - print("Got worker ID: ", worker_id) else: worker_id = "Unknown" except Exception as e: @@ -112,19 +115,18 @@ def main(): ssh_client.connect(ip, username=username, password=password) connected = True worker_id = get_worker_id(ssh_client) - logs, asic_errors, correct_asic_count = check_logs(ip, ssh_client) + logs, asic_errors, asic_results = check_logs(ip, ssh_client, worker_id) + results.extend(asic_results) for log in logs: results.append([worker_id, ip, log[0], log[1]]) - # Check for ASIC chip errors and add to results - # Maybe think about this differntly. Look for "will power off hashboard" and *then* figure out why the hashboard has been powered off. - for chain, counts in asic_errors.items(): - if counts.count(0) >= 3: - results.append([worker_id, ip, "ASIC Error", f"Chain {chain} has {counts.count(0)} failed checks with 0 ASICs found"]) - elif chain in correct_asic_count: - expected_count = correct_asic_count[chain] - if any(count != expected_count and count > 0 for count in counts): - results.append([worker_id, ip, "ASIC Discrepancy", f"Chain {chain} has varying ASIC counts: {counts}"]) + unique_asic_errors = {} # Using a dictionary to store chain and failed check count. + for chain, asic_count in asic_errors: + for chain, asic_count in asic_errors: + failed_checks = unique_asic_errors.get(chain, 0) + 1 # array + unique_asic_errors[chain] = failed_checks + if asic_count == 0 and failed_checks == 3: + results.append([worker_id, ip, "ASIC Error", f"Chain {chain} has 3 failed checks with {asic_count} ASICs found"]) ssh_client.close() break @@ -138,6 +140,7 @@ def main(): writer = csv.writer(file) writer.writerow(["Worker ID", "IP Address", "Log File", "Error"]) writer.writerows(results) + print("Writing: ", results) print("Done") if __name__ == "__main__": diff --git a/readme.md b/readme.md index 1579cb3..fec4e0b 100644 --- a/readme.md +++ b/readme.md @@ -1,3 +1,3 @@ # Antminer error finder -Checks for specific strings in log files of Antminer machines. Currently works with stock firmware. Expand to Braiins & LuxOS. +Checks for specific strings in log files of Antminer machines. Currently works with stock firmware (and barely at that). Expand to Braiins & LuxOS. diff --git a/results.csv b/results.csv deleted file mode 100644 index b1f27ee..0000000 --- a/results.csv +++ /dev/null @@ -1,15 +0,0 @@ -Worker ID,IP Address,Log File,Error -mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_19-00-57/miner.log,ERROR_SOC_INIT: soc init failed -mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_19-00-57/messages,ERROR_SOC_INIT: soc init failed -mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_18-53-25/miner.log,ERROR_SOC_INIT: soc init failed -mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_18-53-25/messages,ERROR_SOC_INIT: soc init failed -mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_19-07-47/miner.log,ERROR_SOC_INIT: soc init failed -mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_19-07-47/messages,ERROR_SOC_INIT: soc init failed -mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_20-51-43/miner.log,ERROR_SOC_INIT: soc init failed -mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_20-51-43/messages,ERROR_SOC_INIT: soc init failed -mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_18-01-31/miner.log,ERROR_SOC_INIT: soc init failed -mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_18-01-31/messages,ERROR_SOC_INIT: soc init failed -mw1012,192.168.1.171,/var/log/test,ERROR_POWER_LOST: power voltage rise or drop -mw1012,192.168.1.171,ASIC Error,Chain 0 has 3 failed checks with 0 ASICs found -mw1012,192.168.1.171,ASIC Discrepancy,"Chain 1 has varying ASIC counts: [110, 110, 110, 110, 110, 110, 104, 104, 104]" -mw1012,192.168.1.171,ASIC Discrepancy,"Chain 2 has varying ASIC counts: [110, 110, 110, 110, 110, 110, 114]"