This gets cleaner.
This commit is contained in:
parent
6c4bf10bcb
commit
e91bb9c89b
5 changed files with 67 additions and 59 deletions
6
credentials.json
Normal file
6
credentials.json
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
{
|
||||||
|
"stock": [["miner", "miner"], ["root", "root"]],
|
||||||
|
"braiins": [["root", "root"]],
|
||||||
|
"luxos": [["root", "root"]],
|
||||||
|
// "vnish": [["user", "password"]] // apparently vnish's ssh port is closed? I messaged them on telegram asking about it.
|
||||||
|
}
|
14
errors.json
Normal file
14
errors.json
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
{
|
||||||
|
"error_keywords": [
|
||||||
|
"ERROR_TEMP_TOO_HIGH",
|
||||||
|
"ERROR_HASHRATE_TOO_LOW",
|
||||||
|
"ERROR_NETWORK_DISCONNECTED",
|
||||||
|
"ERROR_POWER_LOST: power voltage rise or drop",
|
||||||
|
"SWEEP_STRING",
|
||||||
|
"_pic_write_iic failed!",
|
||||||
|
"PLL read exceeded wait time",
|
||||||
|
"ERROR_SOC_INIT: soc init failed",
|
||||||
|
"fail to read 0:1",
|
||||||
|
"fail to write 0:1"
|
||||||
|
]
|
||||||
|
}
|
89
finder.py
89
finder.py
|
@ -1,31 +1,29 @@
|
||||||
import paramiko
|
import paramiko
|
||||||
import csv
|
import csv
|
||||||
import re
|
import re
|
||||||
|
import json
|
||||||
|
|
||||||
# Define credentials for different OS types
|
# Load credentials from a JSON file
|
||||||
credentials = {
|
def load_credentials(file_path):
|
||||||
"stock": [("miner", "miner"), ("root", "root")],
|
with open(file_path, 'r') as file:
|
||||||
"braiins": [("root", "root")],
|
return json.load(file)
|
||||||
"luxos": [("root", "root")],
|
|
||||||
"vnish": [("user", "password")] # Replace with correct credentials
|
|
||||||
}
|
|
||||||
|
|
||||||
# List of error keywords
|
# Load error keywords from a JSON file
|
||||||
error_keywords = [
|
def load_error_keywords(file_path):
|
||||||
"ERROR_TEMP_TOO_HIGH",
|
with open(file_path, 'r') as file:
|
||||||
"ERROR_HASHRATE_TOO_LOW",
|
return json.load(file)['error_keywords']
|
||||||
"ERROR_NETWORK_DISCONNECTED",
|
|
||||||
"ERROR_POWER_LOST: power voltage rise or drop",
|
|
||||||
"SWEEP_STRING",
|
|
||||||
"_pic_write_iic failed!",
|
|
||||||
"PLL read exceeded wait time",
|
|
||||||
"ERROR_SOC_INIT: soc init failed",
|
|
||||||
"fail to read 0:1",
|
|
||||||
"fail to write 0:1"
|
|
||||||
]
|
|
||||||
|
|
||||||
# Regex pattern for ASIC chip errors
|
# Define paths to the configuration files
|
||||||
|
CREDENTIALS_FILE = 'credentials.json'
|
||||||
|
ERRORS_FILE = 'errors.json'
|
||||||
|
|
||||||
|
# Load credentials and error keywords
|
||||||
|
credentials = load_credentials(CREDENTIALS_FILE)
|
||||||
|
error_keywords = load_error_keywords(ERRORS_FILE)
|
||||||
|
|
||||||
|
# Regex patterns for ASIC chip errors and power-off messages
|
||||||
asic_pattern = re.compile(r"Chain\[(\d+)\]: find (\d+) asic, times \d+")
|
asic_pattern = re.compile(r"Chain\[(\d+)\]: find (\d+) asic, times \d+")
|
||||||
|
power_off_pattern = re.compile(r"Chain (\d+) only find (\d+) asic, will power off hash board (\d+)")
|
||||||
|
|
||||||
# Function to read IP addresses from a file
|
# Function to read IP addresses from a file
|
||||||
def read_ips(file_path):
|
def read_ips(file_path):
|
||||||
|
@ -34,16 +32,17 @@ def read_ips(file_path):
|
||||||
return [ip.strip() for ip in ips]
|
return [ip.strip() for ip in ips]
|
||||||
|
|
||||||
# Function to check log files for keywords and ASIC errors
|
# Function to check log files for keywords and ASIC errors
|
||||||
def check_logs(ip, ssh_client):
|
def check_logs(ip, ssh_client, worker_id):
|
||||||
logs = []
|
logs = []
|
||||||
asic_errors = {}
|
asic_errors = set() # Using set to avoid duplicate errors
|
||||||
correct_asic_count = {}
|
results = []
|
||||||
try:
|
try:
|
||||||
print(f"Checking logs on {ip}")
|
print(f"Checking logs on {ip}")
|
||||||
stdin, stdout, stderr = ssh_client.exec_command("find /var/log/ -type f")
|
stdin, stdout, stderr = ssh_client.exec_command("find /var/log/ -type f")
|
||||||
log_files = stdout.readlines()
|
log_files = stdout.readlines()
|
||||||
for log_file in log_files:
|
for log_file in log_files:
|
||||||
log_file = log_file.strip()
|
log_file = log_file.strip()
|
||||||
|
print(f"Checking file: {log_file}") # Debug statement
|
||||||
# Check if file should be ignored
|
# Check if file should be ignored
|
||||||
if log_file.endswith(('tmp', 'utmp', 'btmp', 'wtmp')):
|
if log_file.endswith(('tmp', 'utmp', 'btmp', 'wtmp')):
|
||||||
continue
|
continue
|
||||||
|
@ -56,24 +55,29 @@ def check_logs(ip, ssh_client):
|
||||||
|
|
||||||
stdin, stdout, stderr = ssh_client.exec_command(f"cat {log_file}")
|
stdin, stdout, stderr = ssh_client.exec_command(f"cat {log_file}")
|
||||||
log_content = stdout.read().decode('utf-8', errors='ignore')
|
log_content = stdout.read().decode('utf-8', errors='ignore')
|
||||||
|
print(f"Content of {log_file}: {log_content[:500]}") # Debug statement to show part of the log content
|
||||||
for keyword in error_keywords:
|
for keyword in error_keywords:
|
||||||
if keyword in log_content:
|
if keyword in log_content:
|
||||||
logs.append((log_file, keyword))
|
logs.append((log_file, keyword))
|
||||||
|
|
||||||
# Check for ASIC chip errors
|
# Check for ASIC chip errors and power-off messages
|
||||||
for match in asic_pattern.finditer(log_content):
|
for match in asic_pattern.finditer(log_content):
|
||||||
chain, asic_count = match.groups()
|
chain, asic_count = match.groups()
|
||||||
asic_count = int(asic_count)
|
asic_count = int(asic_count)
|
||||||
if chain not in asic_errors:
|
asic_errors.add((chain, asic_count))
|
||||||
asic_errors[chain] = []
|
print(f"Chain {chain} has {asic_count} chips.") # Debug statement
|
||||||
asic_errors[chain].append(asic_count)
|
|
||||||
|
# Check for power-off messages
|
||||||
|
for match in power_off_pattern.finditer(log_content):
|
||||||
|
chain, found_asic_count, board = match.groups()
|
||||||
|
found_asic_count = int(found_asic_count)
|
||||||
|
chain = int(chain)
|
||||||
|
print(f"Power-off message found: Chain {chain}, ASIC count: {found_asic_count}, Board: {board}") # Debug statement
|
||||||
|
results.append([worker_id, ip, "ASIC Error", f"Chain {chain} has failed with {found_asic_count} ASICs found and will power off hash board {board}"])
|
||||||
|
|
||||||
# Determine the correct ASIC count for each chain
|
|
||||||
if chain not in correct_asic_count and asic_count > 0:
|
|
||||||
correct_asic_count[chain] = asic_count
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error checking logs on {ip}: {e}")
|
print(f"Error checking logs on {ip}: {e}")
|
||||||
return logs, asic_errors, correct_asic_count
|
return logs, asic_errors, results
|
||||||
|
|
||||||
# Function to get worker ID
|
# Function to get worker ID
|
||||||
def get_worker_id(ssh_client):
|
def get_worker_id(ssh_client):
|
||||||
|
@ -85,7 +89,6 @@ def get_worker_id(ssh_client):
|
||||||
match = re.search(r'"user" *: *"[^.]*\.(\w+)"', config_content)
|
match = re.search(r'"user" *: *"[^.]*\.(\w+)"', config_content)
|
||||||
if match:
|
if match:
|
||||||
worker_id = match.group(1)
|
worker_id = match.group(1)
|
||||||
print("Got worker ID: ", worker_id)
|
|
||||||
else:
|
else:
|
||||||
worker_id = "Unknown"
|
worker_id = "Unknown"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -112,19 +115,18 @@ def main():
|
||||||
ssh_client.connect(ip, username=username, password=password)
|
ssh_client.connect(ip, username=username, password=password)
|
||||||
connected = True
|
connected = True
|
||||||
worker_id = get_worker_id(ssh_client)
|
worker_id = get_worker_id(ssh_client)
|
||||||
logs, asic_errors, correct_asic_count = check_logs(ip, ssh_client)
|
logs, asic_errors, asic_results = check_logs(ip, ssh_client, worker_id)
|
||||||
|
results.extend(asic_results)
|
||||||
for log in logs:
|
for log in logs:
|
||||||
results.append([worker_id, ip, log[0], log[1]])
|
results.append([worker_id, ip, log[0], log[1]])
|
||||||
|
|
||||||
# Check for ASIC chip errors and add to results
|
unique_asic_errors = {} # Using a dictionary to store chain and failed check count.
|
||||||
# Maybe think about this differently: look for "will power off hash board" and *then* figure out why the hash board was powered off.
|
for chain, asic_count in asic_errors:
|
||||||
for chain, counts in asic_errors.items():
|
for chain, asic_count in asic_errors:
|
||||||
if counts.count(0) >= 3:
|
failed_checks = unique_asic_errors.get(chain, 0) + 1 # array
|
||||||
results.append([worker_id, ip, "ASIC Error", f"Chain {chain} has {counts.count(0)} failed checks with 0 ASICs found"])
|
unique_asic_errors[chain] = failed_checks
|
||||||
elif chain in correct_asic_count:
|
if asic_count == 0 and failed_checks == 3:
|
||||||
expected_count = correct_asic_count[chain]
|
results.append([worker_id, ip, "ASIC Error", f"Chain {chain} has 3 failed checks with {asic_count} ASICs found"])
|
||||||
if any(count != expected_count and count > 0 for count in counts):
|
|
||||||
results.append([worker_id, ip, "ASIC Discrepancy", f"Chain {chain} has varying ASIC counts: {counts}"])
|
|
||||||
|
|
||||||
ssh_client.close()
|
ssh_client.close()
|
||||||
break
|
break
|
||||||
|
@ -138,6 +140,7 @@ def main():
|
||||||
writer = csv.writer(file)
|
writer = csv.writer(file)
|
||||||
writer.writerow(["Worker ID", "IP Address", "Log File", "Error"])
|
writer.writerow(["Worker ID", "IP Address", "Log File", "Error"])
|
||||||
writer.writerows(results)
|
writer.writerows(results)
|
||||||
|
print("Writing: ", results)
|
||||||
print("Done")
|
print("Done")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -1,3 +1,3 @@
|
||||||
# Antminer error finder
|
# Antminer error finder
|
||||||
|
|
||||||
Checks for specific strings in log files of Antminer machines. Currently works with stock firmware. Expand to Braiins & LuxOS.
|
Checks the log files of Antminer machines for specific error strings. Currently works only with stock firmware (and barely at that). Still to do: expand support to Braiins & LuxOS.
|
||||||
|
|
15
results.csv
15
results.csv
|
@ -1,15 +0,0 @@
|
||||||
Worker ID,IP Address,Log File,Error
|
|
||||||
mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_19-00-57/miner.log,ERROR_SOC_INIT: soc init failed
|
|
||||||
mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_19-00-57/messages,ERROR_SOC_INIT: soc init failed
|
|
||||||
mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_18-53-25/miner.log,ERROR_SOC_INIT: soc init failed
|
|
||||||
mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_18-53-25/messages,ERROR_SOC_INIT: soc init failed
|
|
||||||
mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_19-07-47/miner.log,ERROR_SOC_INIT: soc init failed
|
|
||||||
mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_19-07-47/messages,ERROR_SOC_INIT: soc init failed
|
|
||||||
mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_20-51-43/miner.log,ERROR_SOC_INIT: soc init failed
|
|
||||||
mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_20-51-43/messages,ERROR_SOC_INIT: soc init failed
|
|
||||||
mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_18-01-31/miner.log,ERROR_SOC_INIT: soc init failed
|
|
||||||
mw1012,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_18-01-31/messages,ERROR_SOC_INIT: soc init failed
|
|
||||||
mw1012,192.168.1.171,/var/log/test,ERROR_POWER_LOST: power voltage rise or drop
|
|
||||||
mw1012,192.168.1.171,ASIC Error,Chain 0 has 3 failed checks with 0 ASICs found
|
|
||||||
mw1012,192.168.1.171,ASIC Discrepancy,"Chain 1 has varying ASIC counts: [110, 110, 110, 110, 110, 110, 104, 104, 104]"
|
|
||||||
mw1012,192.168.1.171,ASIC Discrepancy,"Chain 2 has varying ASIC counts: [110, 110, 110, 110, 110, 110, 114]"
|
|
|
Loading…
Reference in a new issue