Antminer-error-finder/finder.py

161 lines
7 KiB
Python
Raw Normal View History

2024-05-25 07:45:09 +01:00
import csv
import json
import re
import shlex

import paramiko
2024-05-25 07:45:09 +01:00
2024-05-25 21:38:38 +01:00
def load_credentials(file_path):
    """Load SSH credentials from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document, returned as-is.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default locale encoding when reading it.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
def load_error_keywords(file_path):
    """Load the error keyword table from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The value stored under the top-level ``"error_keywords"`` key.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the document has no ``"error_keywords"`` key.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default locale encoding when reading it.
    with open(file_path, 'r', encoding='utf-8') as file:
        document = json.load(file)
    return document['error_keywords']
2024-05-25 07:45:09 +01:00
2024-05-25 21:38:38 +01:00
# Configuration file locations; both are relative paths, so they are
# resolved against the current working directory.
CREDENTIALS_FILE = 'credentials.json'
ERRORS_FILE = 'errors.json'

# Both files are read once, when this module is loaded.
credentials = load_credentials(CREDENTIALS_FILE)
error_keywords = load_error_keywords(ERRORS_FILE)
# Compiled once at module level and reused for every log file.

# "Chain[<n>]: find <count> asic, times <k>" -> (chain, asic count)
asic_pattern = re.compile(
    r"Chain\[(\d+)\]: find (\d+) asic, times \d+"
)

# "Chain <n> only find <count> asic, will power off hash board <b>"
# -> (chain, asic count, board)
power_off_pattern = re.compile(
    r"Chain (\d+) only find (\d+) asic, will power off hash board (\d+)"
)

# "Data load fail for chain <n>." -> (chain,)
eeprom_error_pattern = re.compile(
    r"Data load fail for chain (\d+)\."
)

# "No chip bin, chain = <n>" -> (chain,)
chip_bin_pattern = re.compile(
    r"No chip bin, chain = (\d+)"
)
2024-05-25 07:45:09 +01:00
def read_ips(file_path):
    """Read IP addresses from a text file, one per line.

    Surrounding whitespace is stripped from every line. Blank and
    whitespace-only lines are skipped so callers never receive an empty
    string as an address.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of the non-empty, stripped lines in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return [ip for ip in stripped_lines if ip]
def _remote_text(ssh_client, command):
    """Run *command* over SSH and return its stdout as text, dropping undecodable bytes."""
    _stdin, stdout, _stderr = ssh_client.exec_command(command)
    return stdout.read().decode('utf-8', errors='ignore')


def check_logs(ip, ssh_client, worker_id):
    """Scan the text files under /var/log/ on a host for known errors.

    Every regular file found by ``find /var/log/ -type f`` is inspected,
    except files whose name ends in ``tmp`` and files that the remote
    ``file`` command does not describe as text. Each remaining file is
    searched for the keys of the module-level ``error_keywords`` mapping
    and for the module-level ASIC, power-off, EEPROM and chip-bin patterns.

    Any exception raised while scanning is printed and ends the scan; the
    data gathered up to that point is still returned.

    Args:
        ip: Host address; used in printed messages and result rows.
        ssh_client: Connected client exposing ``exec_command(command)``.
        worker_id: Worker identifier copied into every result row.

    Returns:
        A ``(logs, asic_errors, results)`` tuple:

        * ``logs`` - list of ``(log_file, error_type, keyword)`` tuples, one
          per keyword found in a file.
        * ``asic_errors`` - set of ``(chain, asic_count)`` pairs where
          ``chain`` is the matched string and ``asic_count`` an int.
          Because it is a set, repeated identical log lines collapse into
          a single pair.
        * ``results`` - set of ``(worker_id, ip, log_file, error_type,
          message)`` rows for power-off, EEPROM and chip-bin findings.
    """
    logs = []
    asic_errors = set()  # Using set to avoid duplicate errors
    results = set()  # Using set to avoid duplicate entries
    try:
        print(f"Checking logs on {ip}")
        _stdin, stdout, _stderr = ssh_client.exec_command("find /var/log/ -type f")
        for raw_name in stdout.readlines():
            log_file = raw_name.strip()
            if not log_file:
                # A blank listing line would turn into a bare `file` / `cat`.
                continue
            print(f"Checking file: {log_file}")  # Debug statement

            # Check if file should be ignored
            if log_file.endswith(('tmp', 'utmp', 'btmp', 'wtmp')):
                continue

            # The path goes through the remote shell, so quote it: names
            # with spaces or shell metacharacters must stay one argument.
            quoted_path = shlex.quote(log_file)

            # Check if file is a binary file. `file` echoes the path before
            # its description; drop that prefix so a path that happens to
            # contain "text" does not make a binary file look like text.
            file_type = _remote_text(ssh_client, f"file {quoted_path}")
            if file_type.startswith(log_file):
                file_type = file_type[len(log_file):]
            if 'text' not in file_type:
                continue

            log_content = _remote_text(ssh_client, f"cat {quoted_path}")
            print(f"Content of {log_file}: {log_content[:500]}")  # Debug statement to show part of the log content

            for keyword, error_type in error_keywords.items():
                if keyword in log_content:
                    logs.append((log_file, error_type, keyword))

            # Check for ASIC chip errors
            for match in asic_pattern.finditer(log_content):
                chain, asic_count = match.groups()
                asic_count = int(asic_count)
                asic_errors.add((chain, asic_count))
                print(f"Chain {chain} has {asic_count} chips.")  # Debug statement

            # Check for power-off messages
            for match in power_off_pattern.finditer(log_content):
                chain, found_asic_count, board = match.groups()
                found_asic_count = int(found_asic_count)
                chain = int(chain)
                print(f"Power-off message found: Chain {chain}, ASIC count: {found_asic_count}, Board: {board}")  # Debug statement
                results.add((
                    worker_id,
                    ip,
                    log_file,
                    "ASIC Error",
                    f"Chain {chain} has failed with {found_asic_count} ASICs found and will power off hash board {board}",
                ))

            # Check for EEPROM errors
            for match in eeprom_error_pattern.finditer(log_content):
                chain = match.group(1)
                print(f"EEPROM error found: Chain {chain}")  # Debug statement
                results.add((worker_id, ip, log_file, "EEPROM Error", f"Data load fail for chain {chain}"))

            # Check for chip bin errors
            for match in chip_bin_pattern.finditer(log_content):
                chain = match.group(1)
                print(f"Chip bin error found: Chain {chain}")  # Debug statement
                results.add((worker_id, ip, log_file, "Chip Bin Error", f"No chip bin for chain {chain}"))
    except Exception as e:
        # Best effort: report the problem and return whatever was collected.
        print(f"Error checking logs on {ip}: {e}")
    return logs, asic_errors, results
2024-05-25 07:45:09 +01:00
def get_worker_id(ssh_client):
    """Read the worker ID from /config/cgminer.conf on the remote host.

    The worker ID is the part after the first dot in the first ``"user"``
    value that has the form ``"<prefix>.<worker>"``, where ``<worker>``
    consists of word characters only.

    Args:
        ssh_client: Connected client exposing ``exec_command(command)``.

    Returns:
        The worker ID string, or ``"Unknown"`` when no such ``"user"`` value
        exists or when reading/decoding the file raises an exception.
    """
    worker_id = "Unknown"
    try:
        print("Getting worker ID")
        _stdin, stdout, _stderr = ssh_client.exec_command("cat /config/cgminer.conf")
        config_content = stdout.read().decode('utf-8')
        # Extract the worker ID from the user field. The prefix class also
        # excludes '"' so the match cannot run past the end of the user
        # value and pick up a dotted string from a later field.
        match = re.search(r'"user" *: *"[^".]*\.(\w+)"', config_content)
        if match:
            worker_id = match.group(1)
            print("Got Worker ID: ", worker_id)
    except Exception as e:
        print(f"Error getting worker ID: {e}")
        worker_id = "Unknown"
    return worker_id
def _collect_host_rows(ip, ssh_client):
    """Build the set of result rows for one host that is already connected."""
    worker_id = get_worker_id(ssh_client)
    logs, asic_errors, asic_results = check_logs(ip, ssh_client, worker_id)

    rows = set(asic_results)
    for log_file, error_type, keyword in logs:
        rows.add((worker_id, ip, log_file, error_type, keyword))

    # asic_errors holds (chain, asic_count) pairs gathered across all log
    # files of the host, so no single log file can be named for these rows;
    # a placeholder fills the "Log File" column instead.
    # NOTE(review): check_logs returns these pairs as a set, so this counter
    # counts distinct pairs per chain, not repeated log lines - confirm that
    # this matches the intended "3 failed checks" rule.
    failed_checks_by_chain = {}
    for chain, asic_count in asic_errors:
        failed_checks = failed_checks_by_chain.get(chain, 0) + 1
        failed_checks_by_chain[chain] = failed_checks
        if asic_count == 0 and failed_checks == 3:
            rows.add((
                worker_id,
                ip,
                "N/A",
                "ASIC Error",
                f"Chain {chain} has 3 failed checks with {asic_count} ASICs found",
            ))
    return rows


def _scan_host(ip):
    """Log in to *ip* with the first working credential pair and return its result rows."""
    for creds in credentials.values():
        for username, password in creds:
            ssh_client = paramiko.SSHClient()
            # NOTE(review): AutoAddPolicy accepts unknown host keys without
            # verification - confirm this is acceptable for the network.
            ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            try:
                try:
                    print(f"Trying {username}:{password} on {ip}")
                    ssh_client.connect(ip, username=username, password=password)
                except Exception as e:
                    print(f"Connection failed for {ip} with {username}:{password} - {e}")
                    continue
                # Connected: errors from here on are processing errors, not
                # login failures, so do not retry with other credentials.
                try:
                    return _collect_host_rows(ip, ssh_client)
                except Exception as e:
                    print(f"Error processing {ip}: {e}")
                    return set()
            finally:
                # Runs on every path, including `continue` and `return`.
                ssh_client.close()
    return set()


def main():
    """Scan every host listed in ips.txt and write the findings to results.csv.

    For each address, the credential pairs from the module-level
    ``credentials`` mapping are tried in order until one login succeeds;
    that host is then scanned once. All rows are de-duplicated and written
    to ``results.csv`` with a header line.
    """
    ips = read_ips('ips.txt')
    results = set()  # Using set to avoid duplicate entries
    for ip in ips:
        if not ip:
            # Blank entry in the address list; nothing to connect to.
            continue
        print(f"Processing IP: {ip}")
        results.update(_scan_host(ip))

    # Write results to CSV
    print("Writing results to CSV")
    with open('results.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Worker ID", "IP Address", "Log File", "Error Type", "Error Message"])
        # Set iteration order is arbitrary; sort so repeated runs over the
        # same data produce the same file. Comparing via str() keeps the
        # sort safe if a field is not a string.
        writer.writerows(sorted(results, key=lambda row: tuple(map(str, row))))
    print("Done")
# Start the scan only when this file is executed as a script; importing it
# as a module does not call main().
if __name__ == "__main__":
    main()