left results for demo purposes, refactored a bit, started using pandas

This commit is contained in:
Tristan Smith 2024-05-27 14:54:12 -04:00
parent 2e6f06eb1d
commit c6b83b9f0e
3 changed files with 76 additions and 22 deletions

34
2024/May/27/results.csv Normal file
View file

@ -0,0 +1,34 @@
Date,Worker ID,IP Address,Log File,Error Type,Error Message
2024-05-27,mw3446,192.168.1.171,/var/log/domokun,ASIC Error,Chain 2 has failed with 0 ASICs found and will power off hash board 2
2024-05-27,mw3446,192.168.1.171,/var/log/new2.log,ASIC Error,Chain 0 has failed with 0 ASICs found and will power off hash board 0
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_7426394.log,EEPROM Error,Data load fail for chain 1
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_7426394.log,Chip Bin Error,No chip bin for chain 0
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_7426394.log,Chip Bin Error,No chip bin for chain 1
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_7426394.log,Chip Bin Error,No chip bin for chain 2
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_6974116.log,ASIC Error,Chain 1 has failed with 0 ASICs found and will power off hash board 1
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_7426352.log,EEPROM Error,Data load fail for chain 1
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_7426352.log,EEPROM Error,Data load fail for chain 2
2024-05-27,mw3446,192.168.1.171,/var/log/new1.log,ASIC Error,Chain 0 has failed with 96 ASICs found and will power off hash board 0
2024-05-27,mw3446,192.168.1.171,/var/log/failures,ASIC Error,Chain 0 has failed with 0 ASICs found and will power off hash board 0
2024-05-27,mw3446,192.168.1.171,/var/log/failures,ASIC Error,Chain 1 has failed with 104 ASICs found and will power off hash board 1
2024-05-27,mw3446,192.168.1.171,/var/log/new2.log,SoC failure,ERROR_SOC_INIT: soc init failed
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_3174359.log,SoC failure,ERROR_SOC_INIT: soc init failed
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_3174359.log,PSU,bitmain_get_power_status failed
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_3174359.log,PSU,power voltage can not meet the target
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_4594191.log,voltage drop,ERROR_POWER_LOST: power voltage rise or drop
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_4594191.log,black hole,reg crc error
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_7426394.log,Temperature Error,ERROR_TEMP_TOO_HIGH
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_7426394.log,PIC Error,_pic_write_iic failed!
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_19-00-57/miner.log,SoC failure,ERROR_SOC_INIT: soc init failed
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_19-00-57/messages,SoC failure,ERROR_SOC_INIT: soc init failed
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_18-53-25/miner.log,SoC failure,ERROR_SOC_INIT: soc init failed
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_18-53-25/messages,SoC failure,ERROR_SOC_INIT: soc init failed
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_19-07-47/miner.log,SoC failure,ERROR_SOC_INIT: soc init failed
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_19-07-47/messages,SoC failure,ERROR_SOC_INIT: soc init failed
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_20-51-43/miner.log,SoC failure,ERROR_SOC_INIT: soc init failed
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_20-51-43/messages,SoC failure,ERROR_SOC_INIT: soc init failed
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_18-01-31/miner.log,SoC failure,ERROR_SOC_INIT: soc init failed
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_18-01-31/messages,SoC failure,ERROR_SOC_INIT: soc init failed
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_6024072.log,voltage drop,ERROR_POWER_LOST: pic check voltage drop
2024-05-27,mw3446,192.168.1.171,/var/log/test,voltage drop,ERROR_POWER_LOST: power voltage rise or drop
2024-05-27,mw3446,192.168.1.171,/var/log/new1.log,SoC failure,ERROR_SOC_INIT: soc init failed
1 Date Worker ID IP Address Log File Error Type Error Message
2 2024-05-27 mw3446 192.168.1.171 /var/log/domokun ASIC Error Chain 2 has failed with 0 ASICs found and will power off hash board 2
3 2024-05-27 mw3446 192.168.1.171 /var/log/new2.log ASIC Error Chain 0 has failed with 0 ASICs found and will power off hash board 0
4 2024-05-27 mw3446 192.168.1.171 /var/log/Miner_7426394.log EEPROM Error Data load fail for chain 1
5 2024-05-27 mw3446 192.168.1.171 /var/log/Miner_7426394.log Chip Bin Error No chip bin for chain 0
6 2024-05-27 mw3446 192.168.1.171 /var/log/Miner_7426394.log Chip Bin Error No chip bin for chain 1
7 2024-05-27 mw3446 192.168.1.171 /var/log/Miner_7426394.log Chip Bin Error No chip bin for chain 2
8 2024-05-27 mw3446 192.168.1.171 /var/log/Miner_6974116.log ASIC Error Chain 1 has failed with 0 ASICs found and will power off hash board 1
9 2024-05-27 mw3446 192.168.1.171 /var/log/Miner_7426352.log EEPROM Error Data load fail for chain 1
10 2024-05-27 mw3446 192.168.1.171 /var/log/Miner_7426352.log EEPROM Error Data load fail for chain 2
11 2024-05-27 mw3446 192.168.1.171 /var/log/new1.log ASIC Error Chain 0 has failed with 96 ASICs found and will power off hash board 0
12 2024-05-27 mw3446 192.168.1.171 /var/log/failures ASIC Error Chain 0 has failed with 0 ASICs found and will power off hash board 0
13 2024-05-27 mw3446 192.168.1.171 /var/log/failures ASIC Error Chain 1 has failed with 104 ASICs found and will power off hash board 1
14 2024-05-27 mw3446 192.168.1.171 /var/log/new2.log SoC failure ERROR_SOC_INIT: soc init failed
15 2024-05-27 mw3446 192.168.1.171 /var/log/Miner_3174359.log SoC failure ERROR_SOC_INIT: soc init failed
16 2024-05-27 mw3446 192.168.1.171 /var/log/Miner_3174359.log PSU bitmain_get_power_status failed
17 2024-05-27 mw3446 192.168.1.171 /var/log/Miner_3174359.log PSU power voltage can not meet the target
18 2024-05-27 mw3446 192.168.1.171 /var/log/Miner_4594191.log voltage drop ERROR_POWER_LOST: power voltage rise or drop
19 2024-05-27 mw3446 192.168.1.171 /var/log/Miner_4594191.log black hole reg crc error
20 2024-05-27 mw3446 192.168.1.171 /var/log/Miner_7426394.log Temperature Error ERROR_TEMP_TOO_HIGH
21 2024-05-27 mw3446 192.168.1.171 /var/log/Miner_7426394.log PIC Error _pic_write_iic failed!
22 2024-05-27 mw3446 192.168.1.171 /var/log/2024-03/06/cglog_init_2024-03-06_19-00-57/miner.log SoC failure ERROR_SOC_INIT: soc init failed
23 2024-05-27 mw3446 192.168.1.171 /var/log/2024-03/06/cglog_init_2024-03-06_19-00-57/messages SoC failure ERROR_SOC_INIT: soc init failed
24 2024-05-27 mw3446 192.168.1.171 /var/log/2024-03/06/cglog_init_2024-03-06_18-53-25/miner.log SoC failure ERROR_SOC_INIT: soc init failed
25 2024-05-27 mw3446 192.168.1.171 /var/log/2024-03/06/cglog_init_2024-03-06_18-53-25/messages SoC failure ERROR_SOC_INIT: soc init failed
26 2024-05-27 mw3446 192.168.1.171 /var/log/2024-03/06/cglog_init_2024-03-06_19-07-47/miner.log SoC failure ERROR_SOC_INIT: soc init failed
27 2024-05-27 mw3446 192.168.1.171 /var/log/2024-03/06/cglog_init_2024-03-06_19-07-47/messages SoC failure ERROR_SOC_INIT: soc init failed
28 2024-05-27 mw3446 192.168.1.171 /var/log/2024-03/06/cglog_init_2024-03-06_20-51-43/miner.log SoC failure ERROR_SOC_INIT: soc init failed
29 2024-05-27 mw3446 192.168.1.171 /var/log/2024-03/06/cglog_init_2024-03-06_20-51-43/messages SoC failure ERROR_SOC_INIT: soc init failed
30 2024-05-27 mw3446 192.168.1.171 /var/log/2024-03/06/cglog_init_2024-03-06_18-01-31/miner.log SoC failure ERROR_SOC_INIT: soc init failed
31 2024-05-27 mw3446 192.168.1.171 /var/log/2024-03/06/cglog_init_2024-03-06_18-01-31/messages SoC failure ERROR_SOC_INIT: soc init failed
32 2024-05-27 mw3446 192.168.1.171 /var/log/Miner_6024072.log voltage drop ERROR_POWER_LOST: pic check voltage drop
33 2024-05-27 mw3446 192.168.1.171 /var/log/test voltage drop ERROR_POWER_LOST: power voltage rise or drop
34 2024-05-27 mw3446 192.168.1.171 /var/log/new1.log SoC failure ERROR_SOC_INIT: soc init failed

View file

@ -1,7 +1,9 @@
import paramiko import paramiko
import csv
import re import re
import json import json
import os
from datetime import datetime
import pandas as pd
# Load credentials from a JSON file # Load credentials from a JSON file
def load_credentials(file_path): def load_credentials(file_path):
@ -34,10 +36,10 @@ def read_ips(file_path):
return [ip.strip() for ip in ips] return [ip.strip() for ip in ips]
# Function to check log files for keywords and ASIC errors # Function to check log files for keywords and ASIC errors
def check_logs(ip, ssh_client, worker_id): def check_logs(ip, ssh_client, worker_id, current_date):
logs = [] logs = []
asic_errors = set() # Using set to avoid duplicate errors asic_errors = set() # Using set to avoid duplicate errors
results = set() # Using set to avoid duplicate entries results = [] # Using a list to keep insertion order; duplicates are filtered per log file via seen_errors
try: try:
print(f"Checking logs on {ip}") print(f"Checking logs on {ip}")
stdin, stdout, stderr = ssh_client.exec_command("find /var/log/ -type f") stdin, stdout, stderr = ssh_client.exec_command("find /var/log/ -type f")
@ -58,9 +60,13 @@ def check_logs(ip, ssh_client, worker_id):
stdin, stdout, stderr = ssh_client.exec_command(f"cat {log_file}") stdin, stdout, stderr = ssh_client.exec_command(f"cat {log_file}")
log_content = stdout.read().decode('utf-8', errors='ignore') log_content = stdout.read().decode('utf-8', errors='ignore')
print(f"Content of {log_file}: {log_content[:500]}") # Debug statement to show part of the log content print(f"Content of {log_file}: {log_content[:500]}") # Debug statement to show part of the log content
# Track unique errors within this log file
seen_errors = set()
for keyword, error_type in error_keywords.items(): for keyword, error_type in error_keywords.items():
if keyword in log_content: if keyword in log_content and (log_file, error_type, keyword) not in seen_errors:
logs.append((log_file, error_type, keyword)) logs.append((log_file, error_type, keyword))
seen_errors.add((log_file, error_type, keyword))
# Check for ASIC chip errors and power-off messages # Check for ASIC chip errors and power-off messages
for match in asic_pattern.finditer(log_content): for match in asic_pattern.finditer(log_content):
@ -74,20 +80,23 @@ def check_logs(ip, ssh_client, worker_id):
chain, found_asic_count, board = match.groups() chain, found_asic_count, board = match.groups()
found_asic_count = int(found_asic_count) found_asic_count = int(found_asic_count)
chain = int(chain) chain = int(chain)
print(f"Power-off message found: Chain {chain}, ASIC count: {found_asic_count}, Board: {board}") # Debug statement if (log_file, "ASIC Error", f"Chain {chain} has failed with {found_asic_count} ASICs found and will power off hash board {board}") not in seen_errors:
results.add((worker_id, ip, log_file, "ASIC Error", f"Chain {chain} has failed with {found_asic_count} ASICs found and will power off hash board {board}")) results.append((current_date, worker_id, ip, log_file, "ASIC Error", f"Chain {chain} has failed with {found_asic_count} ASICs found and will power off hash board {board}"))
seen_errors.add((log_file, "ASIC Error", f"Chain {chain} has failed with {found_asic_count} ASICs found and will power off hash board {board}"))
# Check for EEPROM errors # Check for EEPROM errors
for match in eeprom_error_pattern.finditer(log_content): for match in eeprom_error_pattern.finditer(log_content):
chain = match.group(1) chain = match.group(1)
print(f"EEPROM error found: Chain {chain}") # Debug statement if (log_file, "EEPROM Error", f"Data load fail for chain {chain}") not in seen_errors:
results.add((worker_id, ip, log_file, "EEPROM Error", f"Data load fail for chain {chain}")) results.append((current_date, worker_id, ip, log_file, "EEPROM Error", f"Data load fail for chain {chain}"))
seen_errors.add((log_file, "EEPROM Error", f"Data load fail for chain {chain}"))
# Check for chip bin errors # Check for chip bin errors
for match in chip_bin_pattern.finditer(log_content): for match in chip_bin_pattern.finditer(log_content):
chain = match.group(1) chain = match.group(1)
print(f"Chip bin error found: Chain {chain}") # Debug statement if (log_file, "Chip Bin Error", f"No chip bin for chain {chain}") not in seen_errors:
results.add((worker_id, ip, log_file, "Chip Bin Error", f"No chip bin for chain {chain}")) results.append((current_date, worker_id, ip, log_file, "Chip Bin Error", f"No chip bin for chain {chain}"))
seen_errors.add((log_file, "Chip Bin Error", f"No chip bin for chain {chain}"))
except Exception as e: except Exception as e:
print(f"Error checking logs on {ip}: {e}") print(f"Error checking logs on {ip}: {e}")
@ -114,7 +123,11 @@ def get_worker_id(ssh_client):
# Main function to iterate over IPs and check for errors # Main function to iterate over IPs and check for errors
def main(): def main():
ips = read_ips('ips.txt') ips = read_ips('ips.txt')
results = set() # Using set to avoid duplicate entries results = [] # Using a list to collect results
current_date = datetime.now().strftime('%Y-%m-%d')
current_year = datetime.now().strftime('%Y')
current_month = datetime.now().strftime('%B')
current_day = datetime.now().strftime('%d')
for ip in ips: for ip in ips:
print(f"Processing IP: {ip}") print(f"Processing IP: {ip}")
@ -130,30 +143,35 @@ def main():
ssh_client.connect(ip, username=username, password=password) ssh_client.connect(ip, username=username, password=password)
connected = True connected = True
worker_id = get_worker_id(ssh_client) worker_id = get_worker_id(ssh_client)
logs, asic_errors, asic_results = check_logs(ip, ssh_client, worker_id) logs, asic_errors, asic_results = check_logs(ip, ssh_client, worker_id, current_date)
results.update(asic_results) results.extend(asic_results)
for log in logs: for log in logs:
results.add((worker_id, ip, log[0], log[1], log[2])) results.append((current_date, worker_id, ip, log[0], log[1], log[2]))
unique_asic_errors = {} # Using a dictionary to store chain and failed check count. unique_asic_errors = {} # Using a dictionary to store chain and failed check count.
for chain, asic_count in asic_errors: for chain, asic_count in asic_errors:
failed_checks = unique_asic_errors.get(chain, 0) + 1 failed_checks = unique_asic_errors.get(chain, 0) + 1
unique_asic_errors[chain] = failed_checks unique_asic_errors[chain] = failed_checks
if asic_count == 0 and failed_checks == 3: if asic_count == 0 and failed_checks == 3:
results.add((worker_id, ip, log[0], "ASIC Error", f"Chain {chain} has 3 failed checks with {asic_count} ASICs found")) results.append((current_date, worker_id, ip, log[0], "ASIC Error", f"Chain {chain} has 3 failed checks with {asic_count} ASICs found"))
ssh_client.close() ssh_client.close()
break break
except Exception as e: except Exception as e:
print(f"Connection failed for {ip} with {username}:{password} - {e}") print(f"Connection failed for {ip} with {username}:{password} - {e}")
ssh_client.close() ssh_client.close()
# Write results to CSV # Create the directory structure
directory = os.path.join(current_year, current_month, current_day)
if not os.path.exists(directory):
os.makedirs(directory)
# Convert results to a DataFrame
results_df = pd.DataFrame(results, columns=["Date", "Worker ID", "IP Address", "Log File", "Error Type", "Error Message"])
# Save the results to a CSV file
print("Writing results to CSV") print("Writing results to CSV")
with open('results.csv', 'w', newline='') as file: results_df.to_csv(os.path.join(directory, 'results.csv'), index=False)
writer = csv.writer(file)
writer.writerow(["Worker ID", "IP Address", "Log File", "Error Type", "Error Message"])
writer.writerows(results)
print("Done") print("Done")
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -1,3 +1,5 @@
# Antminer error finder # Antminer error finder
Checks for specific strings in log files of Antminer machines. Currently works with stock firmware (and barely at that). Expand to Braiins & LuxOS. Checks for specific strings in log files of Antminer machines. Currently works with stock firmware (and barely at that). Planned: expand to Braiins & LuxOS. Results are now written to a CSV file in a `<year>/<month>/<day>/` folder structure.
Started using pandas to write the CSV and to group errors by machine.