added column to csv, added more regex, added more error keywords
This commit is contained in:
parent
e91bb9c89b
commit
078dfd4219
5 changed files with 49 additions and 29 deletions
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"stock": [["miner", "miner"], ["root", "root"]],
|
"stock": [["miner", "miner"], ["root", "root"]],
|
||||||
"braiins": [["root", "root"]],
|
"braiins": [["root", "root"]],
|
||||||
"luxos": [["root", "root"]],
|
"luxos": [["root", "root"]]
|
||||||
// "vnish": [["user", "password"]] // apparently vnish's ssh port is closed? I messaged them on telegram asking about it.
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
1
credentials.json.comments
Normal file
1
credentials.json.comments
Normal file
|
@ -0,0 +1 @@
|
||||||
|
vnish does not have an ssh option. I messaged their developers on telegram and they said it was closed. :(
|
26
errors.json
26
errors.json
|
@ -1,14 +1,16 @@
|
||||||
{
|
{
|
||||||
"error_keywords": [
|
"error_keywords": {
|
||||||
"ERROR_TEMP_TOO_HIGH",
|
"ERROR_TEMP_TOO_HIGH": "Temperature Error",
|
||||||
"ERROR_HASHRATE_TOO_LOW",
|
"ERROR_NETWORK_DISCONNECTED": "probably doesn't exist",
|
||||||
"ERROR_NETWORK_DISCONNECTED",
|
"ERROR_POWER_LOST: power voltage rise or drop": "voltage drop",
|
||||||
"ERROR_POWER_LOST: power voltage rise or drop",
|
"_pic_write_iic failed!": "PIC Error",
|
||||||
"SWEEP_STRING",
|
"PLL read exceeded wait time": "PLL Error",
|
||||||
"_pic_write_iic failed!",
|
"ERROR_SOC_INIT: soc init failed": "SoC failure",
|
||||||
"PLL read exceeded wait time",
|
"fail to read 0:1": "eeprom",
|
||||||
"ERROR_SOC_INIT: soc init failed",
|
"fail to write 0:1": "eeprom",
|
||||||
"fail to read 0:1",
|
"bitmain_get_power_status failed": "PSU",
|
||||||
"fail to write 0:1"
|
"power voltage can not meet the target": "PSU",
|
||||||
]
|
"reg crc error": "black hole",
|
||||||
|
"ERROR_POWER_LOST: pic check voltage drop": "voltage drop"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
42
finder.py
42
finder.py
|
@ -8,7 +8,7 @@ def load_credentials(file_path):
|
||||||
with open(file_path, 'r') as file:
|
with open(file_path, 'r') as file:
|
||||||
return json.load(file)
|
return json.load(file)
|
||||||
|
|
||||||
# Load error keywords from a JSON file
|
# Load error keywords and types from a JSON file
|
||||||
def load_error_keywords(file_path):
|
def load_error_keywords(file_path):
|
||||||
with open(file_path, 'r') as file:
|
with open(file_path, 'r') as file:
|
||||||
return json.load(file)['error_keywords']
|
return json.load(file)['error_keywords']
|
||||||
|
@ -24,6 +24,8 @@ error_keywords = load_error_keywords(ERRORS_FILE)
|
||||||
# Regex patterns for ASIC chip errors and power-off messages
|
# Regex patterns for ASIC chip errors and power-off messages
|
||||||
asic_pattern = re.compile(r"Chain\[(\d+)\]: find (\d+) asic, times \d+")
|
asic_pattern = re.compile(r"Chain\[(\d+)\]: find (\d+) asic, times \d+")
|
||||||
power_off_pattern = re.compile(r"Chain (\d+) only find (\d+) asic, will power off hash board (\d+)")
|
power_off_pattern = re.compile(r"Chain (\d+) only find (\d+) asic, will power off hash board (\d+)")
|
||||||
|
eeprom_error_pattern = re.compile(r"Data load fail for chain (\d+)\.")
|
||||||
|
chip_bin_pattern = re.compile(r"No chip bin, chain = (\d+)")
|
||||||
|
|
||||||
# Function to read IP addresses from a file
|
# Function to read IP addresses from a file
|
||||||
def read_ips(file_path):
|
def read_ips(file_path):
|
||||||
|
@ -35,7 +37,7 @@ def read_ips(file_path):
|
||||||
def check_logs(ip, ssh_client, worker_id):
|
def check_logs(ip, ssh_client, worker_id):
|
||||||
logs = []
|
logs = []
|
||||||
asic_errors = set() # Using set to avoid duplicate errors
|
asic_errors = set() # Using set to avoid duplicate errors
|
||||||
results = []
|
results = set() # Using set to avoid duplicate entries
|
||||||
try:
|
try:
|
||||||
print(f"Checking logs on {ip}")
|
print(f"Checking logs on {ip}")
|
||||||
stdin, stdout, stderr = ssh_client.exec_command("find /var/log/ -type f")
|
stdin, stdout, stderr = ssh_client.exec_command("find /var/log/ -type f")
|
||||||
|
@ -56,9 +58,9 @@ def check_logs(ip, ssh_client, worker_id):
|
||||||
stdin, stdout, stderr = ssh_client.exec_command(f"cat {log_file}")
|
stdin, stdout, stderr = ssh_client.exec_command(f"cat {log_file}")
|
||||||
log_content = stdout.read().decode('utf-8', errors='ignore')
|
log_content = stdout.read().decode('utf-8', errors='ignore')
|
||||||
print(f"Content of {log_file}: {log_content[:500]}") # Debug statement to show part of the log content
|
print(f"Content of {log_file}: {log_content[:500]}") # Debug statement to show part of the log content
|
||||||
for keyword in error_keywords:
|
for keyword, error_type in error_keywords.items():
|
||||||
if keyword in log_content:
|
if keyword in log_content:
|
||||||
logs.append((log_file, keyword))
|
logs.append((log_file, error_type, keyword))
|
||||||
|
|
||||||
# Check for ASIC chip errors and power-off messages
|
# Check for ASIC chip errors and power-off messages
|
||||||
for match in asic_pattern.finditer(log_content):
|
for match in asic_pattern.finditer(log_content):
|
||||||
|
@ -73,7 +75,19 @@ def check_logs(ip, ssh_client, worker_id):
|
||||||
found_asic_count = int(found_asic_count)
|
found_asic_count = int(found_asic_count)
|
||||||
chain = int(chain)
|
chain = int(chain)
|
||||||
print(f"Power-off message found: Chain {chain}, ASIC count: {found_asic_count}, Board: {board}") # Debug statement
|
print(f"Power-off message found: Chain {chain}, ASIC count: {found_asic_count}, Board: {board}") # Debug statement
|
||||||
results.append([worker_id, ip, "ASIC Error", f"Chain {chain} has failed with {found_asic_count} ASICs found and will power off hash board {board}"])
|
results.add((worker_id, ip, log_file, "ASIC Error", f"Chain {chain} has failed with {found_asic_count} ASICs found and will power off hash board {board}"))
|
||||||
|
|
||||||
|
# Check for EEPROM errors
|
||||||
|
for match in eeprom_error_pattern.finditer(log_content):
|
||||||
|
chain = match.group(1)
|
||||||
|
print(f"EEPROM error found: Chain {chain}") # Debug statement
|
||||||
|
results.add((worker_id, ip, log_file, "EEPROM Error", f"Data load fail for chain {chain}"))
|
||||||
|
|
||||||
|
# Check for chip bin errors
|
||||||
|
for match in chip_bin_pattern.finditer(log_content):
|
||||||
|
chain = match.group(1)
|
||||||
|
print(f"Chip bin error found: Chain {chain}") # Debug statement
|
||||||
|
results.add((worker_id, ip, log_file, "Chip Bin Error", f"No chip bin for chain {chain}"))
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error checking logs on {ip}: {e}")
|
print(f"Error checking logs on {ip}: {e}")
|
||||||
|
@ -99,7 +113,7 @@ def get_worker_id(ssh_client):
|
||||||
# Main function to iterate over IPs and check for errors
|
# Main function to iterate over IPs and check for errors
|
||||||
def main():
|
def main():
|
||||||
ips = read_ips('ips.txt')
|
ips = read_ips('ips.txt')
|
||||||
results = []
|
results = set() # Using set to avoid duplicate entries
|
||||||
|
|
||||||
for ip in ips:
|
for ip in ips:
|
||||||
print(f"Processing IP: {ip}")
|
print(f"Processing IP: {ip}")
|
||||||
|
@ -116,17 +130,16 @@ def main():
|
||||||
connected = True
|
connected = True
|
||||||
worker_id = get_worker_id(ssh_client)
|
worker_id = get_worker_id(ssh_client)
|
||||||
logs, asic_errors, asic_results = check_logs(ip, ssh_client, worker_id)
|
logs, asic_errors, asic_results = check_logs(ip, ssh_client, worker_id)
|
||||||
results.extend(asic_results)
|
results.update(asic_results)
|
||||||
for log in logs:
|
for log in logs:
|
||||||
results.append([worker_id, ip, log[0], log[1]])
|
results.add((worker_id, ip, log[0], log[1], log[2]))
|
||||||
|
|
||||||
unique_asic_errors = {} # Using a dictionary to store chain and failed check count.
|
unique_asic_errors = {} # Using a dictionary to store chain and failed check count.
|
||||||
for chain, asic_count in asic_errors:
|
for chain, asic_count in asic_errors:
|
||||||
for chain, asic_count in asic_errors:
|
failed_checks = unique_asic_errors.get(chain, 0) + 1
|
||||||
failed_checks = unique_asic_errors.get(chain, 0) + 1 # array
|
unique_asic_errors[chain] = failed_checks
|
||||||
unique_asic_errors[chain] = failed_checks
|
if asic_count == 0 and failed_checks == 3:
|
||||||
if asic_count == 0 and failed_checks == 3:
|
results.add((worker_id, ip, log_file, "ASIC Error", f"Chain {chain} has 3 failed checks with {asic_count} ASICs found"))
|
||||||
results.append([worker_id, ip, "ASIC Error", f"Chain {chain} has 3 failed checks with {asic_count} ASICs found"])
|
|
||||||
|
|
||||||
ssh_client.close()
|
ssh_client.close()
|
||||||
break
|
break
|
||||||
|
@ -138,9 +151,8 @@ def main():
|
||||||
print("Writing results to CSV")
|
print("Writing results to CSV")
|
||||||
with open('results.csv', 'w', newline='') as file:
|
with open('results.csv', 'w', newline='') as file:
|
||||||
writer = csv.writer(file)
|
writer = csv.writer(file)
|
||||||
writer.writerow(["Worker ID", "IP Address", "Log File", "Error"])
|
writer.writerow(["Worker ID", "IP Address", "Log File", "Error Type", "Error Message"])
|
||||||
writer.writerows(results)
|
writer.writerows(results)
|
||||||
print("Writing: ", results)
|
|
||||||
print("Done")
|
print("Done")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
5
warnings.json
Normal file
5
warnings.json
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
{
|
||||||
|
"warning_keywords": {
|
||||||
|
"the ECC used on your system is too weak compared to the one required by the NAND chip": "warning"
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue