left results for demo purposes, refactored a bit, started using pandas
This commit is contained in:
parent
2e6f06eb1d
commit
c6b83b9f0e
3 changed files with 76 additions and 22 deletions
34
2024/May/27/results.csv
Normal file
34
2024/May/27/results.csv
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
Date,Worker ID,IP Address,Log File,Error Type,Error Message
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/domokun,ASIC Error,Chain 2 has failed with 0 ASICs found and will power off hash board 2
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/new2.log,ASIC Error,Chain 0 has failed with 0 ASICs found and will power off hash board 0
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_7426394.log,EEPROM Error,Data load fail for chain 1
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_7426394.log,Chip Bin Error,No chip bin for chain 0
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_7426394.log,Chip Bin Error,No chip bin for chain 1
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_7426394.log,Chip Bin Error,No chip bin for chain 2
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_6974116.log,ASIC Error,Chain 1 has failed with 0 ASICs found and will power off hash board 1
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_7426352.log,EEPROM Error,Data load fail for chain 1
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_7426352.log,EEPROM Error,Data load fail for chain 2
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/new1.log,ASIC Error,Chain 0 has failed with 96 ASICs found and will power off hash board 0
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/failures,ASIC Error,Chain 0 has failed with 0 ASICs found and will power off hash board 0
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/failures,ASIC Error,Chain 1 has failed with 104 ASICs found and will power off hash board 1
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/new2.log,SoC failure,ERROR_SOC_INIT: soc init failed
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_3174359.log,SoC failure,ERROR_SOC_INIT: soc init failed
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_3174359.log,PSU,bitmain_get_power_status failed
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_3174359.log,PSU,power voltage can not meet the target
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_4594191.log,voltage drop,ERROR_POWER_LOST: power voltage rise or drop
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_4594191.log,black hole,reg crc error
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_7426394.log,Temperature Error,ERROR_TEMP_TOO_HIGH
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_7426394.log,PIC Error,_pic_write_iic failed!
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_19-00-57/miner.log,SoC failure,ERROR_SOC_INIT: soc init failed
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_19-00-57/messages,SoC failure,ERROR_SOC_INIT: soc init failed
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_18-53-25/miner.log,SoC failure,ERROR_SOC_INIT: soc init failed
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_18-53-25/messages,SoC failure,ERROR_SOC_INIT: soc init failed
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_19-07-47/miner.log,SoC failure,ERROR_SOC_INIT: soc init failed
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_19-07-47/messages,SoC failure,ERROR_SOC_INIT: soc init failed
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_20-51-43/miner.log,SoC failure,ERROR_SOC_INIT: soc init failed
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_20-51-43/messages,SoC failure,ERROR_SOC_INIT: soc init failed
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_18-01-31/miner.log,SoC failure,ERROR_SOC_INIT: soc init failed
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/2024-03/06/cglog_init_2024-03-06_18-01-31/messages,SoC failure,ERROR_SOC_INIT: soc init failed
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/Miner_6024072.log,voltage drop,ERROR_POWER_LOST: pic check voltage drop
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/test,voltage drop,ERROR_POWER_LOST: power voltage rise or drop
|
||||||
|
2024-05-27,mw3446,192.168.1.171,/var/log/new1.log,SoC failure,ERROR_SOC_INIT: soc init failed
|
|
60
finder.py
60
finder.py
|
@ -1,7 +1,9 @@
|
||||||
import paramiko
|
import paramiko
|
||||||
import csv
|
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
# Load credentials from a JSON file
|
# Load credentials from a JSON file
|
||||||
def load_credentials(file_path):
|
def load_credentials(file_path):
|
||||||
|
@ -34,10 +36,10 @@ def read_ips(file_path):
|
||||||
return [ip.strip() for ip in ips]
|
return [ip.strip() for ip in ips]
|
||||||
|
|
||||||
# Function to check log files for keywords and ASIC errors
|
# Function to check log files for keywords and ASIC errors
|
||||||
def check_logs(ip, ssh_client, worker_id):
|
def check_logs(ip, ssh_client, worker_id, current_date):
|
||||||
logs = []
|
logs = []
|
||||||
asic_errors = set() # Using set to avoid duplicate errors
|
asic_errors = set() # Using set to avoid duplicate errors
|
||||||
results = set() # Using set to avoid duplicate entries
|
results = [] # Using a list to collect results; duplicates are skipped via the seen_errors set
|
||||||
try:
|
try:
|
||||||
print(f"Checking logs on {ip}")
|
print(f"Checking logs on {ip}")
|
||||||
stdin, stdout, stderr = ssh_client.exec_command("find /var/log/ -type f")
|
stdin, stdout, stderr = ssh_client.exec_command("find /var/log/ -type f")
|
||||||
|
@ -58,9 +60,13 @@ def check_logs(ip, ssh_client, worker_id):
|
||||||
stdin, stdout, stderr = ssh_client.exec_command(f"cat {log_file}")
|
stdin, stdout, stderr = ssh_client.exec_command(f"cat {log_file}")
|
||||||
log_content = stdout.read().decode('utf-8', errors='ignore')
|
log_content = stdout.read().decode('utf-8', errors='ignore')
|
||||||
print(f"Content of {log_file}: {log_content[:500]}") # Debug statement to show part of the log content
|
print(f"Content of {log_file}: {log_content[:500]}") # Debug statement to show part of the log content
|
||||||
|
|
||||||
|
# Track unique errors within this log file
|
||||||
|
seen_errors = set()
|
||||||
for keyword, error_type in error_keywords.items():
|
for keyword, error_type in error_keywords.items():
|
||||||
if keyword in log_content:
|
if keyword in log_content and (log_file, error_type, keyword) not in seen_errors:
|
||||||
logs.append((log_file, error_type, keyword))
|
logs.append((log_file, error_type, keyword))
|
||||||
|
seen_errors.add((log_file, error_type, keyword))
|
||||||
|
|
||||||
# Check for ASIC chip errors and power-off messages
|
# Check for ASIC chip errors and power-off messages
|
||||||
for match in asic_pattern.finditer(log_content):
|
for match in asic_pattern.finditer(log_content):
|
||||||
|
@ -74,20 +80,23 @@ def check_logs(ip, ssh_client, worker_id):
|
||||||
chain, found_asic_count, board = match.groups()
|
chain, found_asic_count, board = match.groups()
|
||||||
found_asic_count = int(found_asic_count)
|
found_asic_count = int(found_asic_count)
|
||||||
chain = int(chain)
|
chain = int(chain)
|
||||||
print(f"Power-off message found: Chain {chain}, ASIC count: {found_asic_count}, Board: {board}") # Debug statement
|
if (log_file, "ASIC Error", f"Chain {chain} has failed with {found_asic_count} ASICs found and will power off hash board {board}") not in seen_errors:
|
||||||
results.add((worker_id, ip, log_file, "ASIC Error", f"Chain {chain} has failed with {found_asic_count} ASICs found and will power off hash board {board}"))
|
results.append((current_date, worker_id, ip, log_file, "ASIC Error", f"Chain {chain} has failed with {found_asic_count} ASICs found and will power off hash board {board}"))
|
||||||
|
seen_errors.add((log_file, "ASIC Error", f"Chain {chain} has failed with {found_asic_count} ASICs found and will power off hash board {board}"))
|
||||||
|
|
||||||
# Check for EEPROM errors
|
# Check for EEPROM errors
|
||||||
for match in eeprom_error_pattern.finditer(log_content):
|
for match in eeprom_error_pattern.finditer(log_content):
|
||||||
chain = match.group(1)
|
chain = match.group(1)
|
||||||
print(f"EEPROM error found: Chain {chain}") # Debug statement
|
if (log_file, "EEPROM Error", f"Data load fail for chain {chain}") not in seen_errors:
|
||||||
results.add((worker_id, ip, log_file, "EEPROM Error", f"Data load fail for chain {chain}"))
|
results.append((current_date, worker_id, ip, log_file, "EEPROM Error", f"Data load fail for chain {chain}"))
|
||||||
|
seen_errors.add((log_file, "EEPROM Error", f"Data load fail for chain {chain}"))
|
||||||
|
|
||||||
# Check for chip bin errors
|
# Check for chip bin errors
|
||||||
for match in chip_bin_pattern.finditer(log_content):
|
for match in chip_bin_pattern.finditer(log_content):
|
||||||
chain = match.group(1)
|
chain = match.group(1)
|
||||||
print(f"Chip bin error found: Chain {chain}") # Debug statement
|
if (log_file, "Chip Bin Error", f"No chip bin for chain {chain}") not in seen_errors:
|
||||||
results.add((worker_id, ip, log_file, "Chip Bin Error", f"No chip bin for chain {chain}"))
|
results.append((current_date, worker_id, ip, log_file, "Chip Bin Error", f"No chip bin for chain {chain}"))
|
||||||
|
seen_errors.add((log_file, "Chip Bin Error", f"No chip bin for chain {chain}"))
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error checking logs on {ip}: {e}")
|
print(f"Error checking logs on {ip}: {e}")
|
||||||
|
@ -114,7 +123,11 @@ def get_worker_id(ssh_client):
|
||||||
# Main function to iterate over IPs and check for errors
|
# Main function to iterate over IPs and check for errors
|
||||||
def main():
|
def main():
|
||||||
ips = read_ips('ips.txt')
|
ips = read_ips('ips.txt')
|
||||||
results = set() # Using set to avoid duplicate entries
|
results = [] # Using a list to collect results
|
||||||
|
current_date = datetime.now().strftime('%Y-%m-%d')
|
||||||
|
current_year = datetime.now().strftime('%Y')
|
||||||
|
current_month = datetime.now().strftime('%B')
|
||||||
|
current_day = datetime.now().strftime('%d')
|
||||||
|
|
||||||
for ip in ips:
|
for ip in ips:
|
||||||
print(f"Processing IP: {ip}")
|
print(f"Processing IP: {ip}")
|
||||||
|
@ -130,30 +143,35 @@ def main():
|
||||||
ssh_client.connect(ip, username=username, password=password)
|
ssh_client.connect(ip, username=username, password=password)
|
||||||
connected = True
|
connected = True
|
||||||
worker_id = get_worker_id(ssh_client)
|
worker_id = get_worker_id(ssh_client)
|
||||||
logs, asic_errors, asic_results = check_logs(ip, ssh_client, worker_id)
|
logs, asic_errors, asic_results = check_logs(ip, ssh_client, worker_id, current_date)
|
||||||
results.update(asic_results)
|
results.extend(asic_results)
|
||||||
for log in logs:
|
for log in logs:
|
||||||
results.add((worker_id, ip, log[0], log[1], log[2]))
|
results.append((current_date, worker_id, ip, log[0], log[1], log[2]))
|
||||||
|
|
||||||
unique_asic_errors = {} # Using a dictionary to store chain and failed check count.
|
unique_asic_errors = {} # Using a dictionary to store chain and failed check count.
|
||||||
for chain, asic_count in asic_errors:
|
for chain, asic_count in asic_errors:
|
||||||
failed_checks = unique_asic_errors.get(chain, 0) + 1
|
failed_checks = unique_asic_errors.get(chain, 0) + 1
|
||||||
unique_asic_errors[chain] = failed_checks
|
unique_asic_errors[chain] = failed_checks
|
||||||
if asic_count == 0 and failed_checks == 3:
|
if asic_count == 0 and failed_checks == 3:
|
||||||
results.add((worker_id, ip, log[0], "ASIC Error", f"Chain {chain} has 3 failed checks with {asic_count} ASICs found"))
|
results.append((current_date, worker_id, ip, log[0], "ASIC Error", f"Chain {chain} has 3 failed checks with {asic_count} ASICs found"))
|
||||||
|
|
||||||
ssh_client.close()
|
ssh_client.close()
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Connection failed for {ip} with {username}:{password} - {e}")
|
print(f"Connection failed for {ip} with {username}:{password} - {e}")
|
||||||
ssh_client.close()
|
ssh_client.close()
|
||||||
|
|
||||||
# Write results to CSV
|
# Create the directory structure
|
||||||
|
directory = os.path.join(current_year, current_month, current_day)
|
||||||
|
if not os.path.exists(directory):
|
||||||
|
os.makedirs(directory)
|
||||||
|
|
||||||
|
# Convert results to a DataFrame
|
||||||
|
results_df = pd.DataFrame(results, columns=["Date", "Worker ID", "IP Address", "Log File", "Error Type", "Error Message"])
|
||||||
|
|
||||||
|
# Save the results to a CSV file
|
||||||
print("Writing results to CSV")
|
print("Writing results to CSV")
|
||||||
with open('results.csv', 'w', newline='') as file:
|
results_df.to_csv(os.path.join(directory, 'results.csv'), index=False)
|
||||||
writer = csv.writer(file)
|
|
||||||
writer.writerow(["Worker ID", "IP Address", "Log File", "Error Type", "Error Message"])
|
|
||||||
writer.writerows(results)
|
|
||||||
print("Done")
|
print("Done")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
# Antminer error finder
|
# Antminer error finder
|
||||||
|
|
||||||
Checks for specific strings in log files of Antminer machines. Currently works with stock firmware (and barely at that). Expand to Braiins & LuxOS.
|
Checks for specific strings in log files of Antminer machines. Currently works with stock firmware (and barely at that). Planned: expand to Braiins & LuxOS. Results are now written to a CSV file in a `<year>/<month>/<day>/` folder structure.
|
||||||
|
|
||||||
|
Now uses pandas for writing the CSV file and for grouping errors by machine.
|
||||||
|
|
Loading…
Reference in a new issue