Source code for palangre_syc.excel_extractions

""" 

Module de fonctions qui permettent l'extraction des données de logbook palangre 
selon le format utilisé par les Seychelles

"""
import pandas as pd
import numpy as np
import re

from django.utils.translation import gettext as _

from api_traitement import common_functions

def extract_vessel_info(df_donnees):
    """Extract the 'Vessel Information' cells from page 1 of the logbook.

    Args:
        df_donnees (df): excel p1

    Returns:
        df: two columns, 'Logbook_name' and 'Value'
    """
    # Rows 8-16 of the first column hold "label: value" pairs.
    raw_pairs = df_donnees.iloc[7:16, 0]

    # Split each cell on ':' into a label part and a value part.
    pairs = raw_pairs.str.split(':', expand=True)

    # Coerce every cell to a stripped string ('' for missing cells).
    pairs = pairs.map(lambda cell: str(cell).strip() if cell is not None else '')
    pairs.columns = ['Logbook_name', 'Value']

    # Drop special characters from the labels, then re-strip them.
    pairs['Logbook_name'] = common_functions.remove_spec_char_from_list(pairs['Logbook_name'])
    pairs['Logbook_name'] = pairs['Logbook_name'].apply(
        lambda cell: str(cell).strip() if cell is not None else '')

    return pairs
def extract_cruise_info(df_donnees, version):
    """Extract the 'Cruise Information' cells from page 1 of the logbook.

    Args:
        df_donnees (df): excel p1
        version (str): logbook layout, "ll_17.6" or "ll_26"

    Returns:
        df: two columns, 'Logbook_name' and 'Value'

    Raises:
        ValueError: if `version` is not a known logbook layout.
    """
    # Cell coordinates of the two label/value pairs depend on the layout.
    if version == "ll_17.6":
        df_cruise1 = df_donnees.iloc[[9], [11, 16]]
        df_cruise2 = df_donnees.iloc[[9], [20, 25]]
    elif version == "ll_26":
        df_cruise1 = df_donnees.iloc[[9], [9, 14]]
        df_cruise2 = df_donnees.iloc[[9], [18, 23]]
    else:
        # An unknown version previously fell through to an UnboundLocalError;
        # fail with an explicit message instead.
        raise ValueError(f"Unsupported logbook version: {version!r}")

    # Give both fragments the same column names so they can be stacked.
    df_cruise1.columns = ['Logbook_name', 'Value']
    df_cruise2.columns = ['Logbook_name', 'Value']

    # Vertical concatenation of the two label/value pairs.
    df_cruise = pd.concat([df_cruise1, df_cruise2], axis=0, ignore_index=True)

    # Clean labels (drop ':' and surrounding spaces) and strip text values.
    df_cruise['Logbook_name'] = df_cruise.iloc[:, 0].str.replace(':', '').str.strip()
    df_cruise['Value'] = df_cruise.iloc[:, 1].apply(lambda x: x.strip() if isinstance(x, str) else x)

    # Filter special characters out of both columns.
    df_cruise['Logbook_name'] = common_functions.remove_spec_char_from_list(df_cruise['Logbook_name'])
    df_cruise['Value'] = common_functions.remove_spec_char_from_list(df_cruise['Value'])

    df_cruise['Logbook_name'] = df_cruise['Logbook_name'].str.strip()
    return df_cruise
def extract_report_info(df_donnees, version):
    """Extract the 'Report Information' cells from page 1 of the logbook.

    Args:
        df_donnees (df): excel p1
        version (str): logbook layout, "ll_17.6" or "ll_26"

    Returns:
        df: two columns, 'Logbook_name' and 'Value' (Value is pd.NA when the
        sheet only provided labels)

    Raises:
        ValueError: if `version` is not a known logbook layout.
    """
    if version == "ll_17.6":
        df_report = df_donnees.iloc[7:9, 29:35]
    elif version == "ll_26":
        df_report = df_donnees.iloc[7:9, 27:33]
    else:
        raise ValueError(f"Unsupported logbook version: {version!r}")

    # Drop fully empty columns (dropna returns a new frame, safe to mutate).
    df_report = df_report.dropna(axis=1, how='all')

    # Clean the label column: remove ':' and surrounding spaces.
    df_report.iloc[:, 0] = df_report.iloc[:, 0].str.replace(':', '').str.strip()

    if df_report.shape[1] < 2:
        # Only the labels survived: add an empty 'Value' column.
        df_report.columns = ['Logbook_name']
        df_report['Value'] = pd.NA
    else:
        # Strip text cells of the value column.
        df_report.iloc[:, 1] = df_report.iloc[:, 1].apply(lambda x: x.strip() if isinstance(x, str) else x)
        df_report.columns = ['Logbook_name', 'Value']

    # Shared label cleanup (previously duplicated in both branches).
    df_report['Logbook_name'] = common_functions.remove_spec_char_from_list(df_report['Logbook_name'])
    df_report['Logbook_name'] = df_report['Logbook_name'].str.strip()
    return df_report
def extract_gear_info(df_donnees, version):
    """Extract the 'Gear' cells from page 1 of the logbook.

    Args:
        df_donnees (df): excel p1
        version (str): logbook layout, "ll_17.6" or "ll_26"

    Returns:
        df: columns 'Logbook_name' and 'Value', or a (df, message) tuple when
        non-integer values remain after cleanup.
    """
    if version == "ll_17.6":
        df_gear = df_donnees.iloc[12:16, 11:21]
    elif version == "ll_26":
        df_gear = df_donnees.iloc[12:16, 9:19]

    # Drop the empty columns of the extracted area.
    df_gear = common_functions.del_empty_col(df_gear)

    # Clean the label column: remove ':' and surrounding spaces.
    df_gear.iloc[:, 0] = df_gear.iloc[:, 0].str.replace(':', '').str.strip()

    # Strip text cells of the value column, if one survived the cleanup.
    if df_gear.shape[1] != 1:
        df_gear.iloc[:, 1] = df_gear.iloc[:, 1].apply(lambda x: x.strip() if isinstance(x, str) else x)
    else:
        df_gear['Value'] = None

    df_gear.columns = ['Logbook_name', 'Value']

    df_gear['Logbook_name'] = common_functions.remove_spec_char_from_list(df_gear['Logbook_name'])
    df_gear['Logbook_name'] = df_gear['Logbook_name'].str.strip()

    # Validate that the submitted values are integers.
    toutes_int = df_gear['Value'].apply(lambda cellule: isinstance(cellule, int)).all()
    # BUG FIX: the original guarded this block with `if toutes_int`, which
    # made the error return below unreachable (values already all ints).
    # Strip strings only when something is NOT an int, then re-check.
    if not toutes_int:
        df_gear['Value'] = np.vectorize(common_functions.strip_if_string)(df_gear['Value'])
        if not df_gear['Value'].apply(lambda x: isinstance(x, int)).all():
            message = _("Les données remplies dans le fichier soumis ne correspondent pas au type de données attendues. Ici on attend seulement des entiers.")
            return df_gear, message
    return df_gear
def extract_line_material_v17(df_donnees):
    """Extract the line-material 'Gear' cells (logbook v17.6, 2024).

    Args:
        df_donnees (df): excel p1

    Returns:
        df of the ticked material row(s); when no row is ticked, a
        (df, message) tuple is returned instead.
    """
    # Material tick boxes live in rows 13-16, columns V-AC of the sheet.
    df_line = df_donnees.iloc[12:16, 21:29]
    # Drop the empty columns of the extracted area.
    df_line = common_functions.del_empty_col(df_line)
    # Clean the label column: remove ':' and surrounding spaces.
    df_line.iloc[:, 0] = df_line.iloc[:, 0].str.replace(':', '').str.strip()
    # Strip text cells of the value column.
    df_line.iloc[:, 1] = df_line.iloc[:, 1].apply(lambda x: x.strip() if isinstance(x, str) else x)
    # Rename the columns.
    df_line.columns = ['Logbook_name', 'Value']
    # Filter special characters out of the labels.
    df_line['Logbook_name'] = common_functions.remove_spec_char_from_list(df_line['Logbook_name'])
    # Remove the extra spaces.
    df_line['Logbook_name'] = df_line['Logbook_name'].str.strip()
    df_line['Value'] = df_line['Value'].str.strip()
    # Keep only the rows whose tick box is filled.
    df_line_used = df_line[(df_line["Value"] != "None") & (df_line["Value"].notna())]
    if len(df_line_used) > 1:
        # NOTE(review): `message` is built but NOT returned here, unlike the
        # len == 0 branch below — likely intended `return df_line_used, message`;
        # confirm against the callers before changing.
        message = _("Ici on n'attend qu'un seul matériau. Veuillez vérifier les données.")
        return df_line_used
    if len(df_line_used) == 0:
        message = _("La table entre les lignes 13 à 16 de la colonne 'AC' ne sont pas saisies. Veuillez vérifier les données.")
        return df_line_used, message
    return df_line_used
def extract_line_material_v26(df_donnees):
    """Extract the line-material 'Gear' cells (logbook v26).

    Args:
        df_donnees (df): excel p1

    Returns:
        df: columns 'Logbook_name' and 'Value'
    """
    # Material cells live in rows 13-16, columns T-AA; drop empty columns.
    materials = common_functions.del_empty_col(df_donnees.iloc[12:16, 19:27])

    # Labels: drop ':' and surrounding whitespace.
    materials.iloc[:, 0] = materials.iloc[:, 0].str.replace(':', '').str.strip()

    # Values: strip text cells when a value column survived the cleanup;
    # otherwise create an empty one.
    if materials.shape[1] != 1:
        materials.iloc[:, 1] = materials.iloc[:, 1].apply(
            lambda cell: cell.strip() if isinstance(cell, str) else cell)
    else:
        materials['Value'] = None

    materials.columns = ['Logbook_name', 'Value']

    # Special-character filtering on both columns, then final strip.
    materials['Logbook_name'] = common_functions.remove_spec_char_from_list(materials['Logbook_name'])
    materials['Value'] = common_functions.remove_spec_char_from_list(materials['Value'])
    materials['Logbook_name'] = materials['Logbook_name'].str.strip()
    materials['Value'] = materials['Value'].str.strip()

    return materials
def extract_target_species(df_donnees, version):
    """Extract the 'Target species' tick boxes from page 1 of the logbook.

    Args:
        df_donnees (df): excel p1
        version (str): logbook layout, "ll_17.6" or "ll_26"

    Returns:
        df: one 'Logbook_name' row per ticked species (empty when none).
    """
    if version == "ll_17.6":
        df_target = df_donnees.iloc[12:16, 29:34]
    elif version == "ll_26":
        df_target = df_donnees.iloc[12:16, 27:32]

    # Drop the empty columns of the extracted area.
    df_target = common_functions.del_empty_col(df_target)

    # Clean the label column: remove ':' and surrounding spaces.
    df_target.iloc[:, 0] = df_target.iloc[:, 0].str.replace(':', '').str.strip()

    if df_target.shape[1] > 1:
        df_target.iloc[:, 1] = df_target.iloc[:, 1].apply(lambda x: x.strip() if isinstance(x, str) else x)
    else:
        # No "Value" column survived -> no target species ticked.
        return pd.DataFrame(columns=["Logbook_name"])

    df_target.columns = ['Logbook_name', 'Value']

    df_target['Logbook_name'] = common_functions.remove_spec_char_from_list(df_target['Logbook_name'])
    df_target['Logbook_name'] = df_target['Logbook_name'].str.strip()

    # Keep only the ticked rows. BUG FIX: empty excel cells arrive as NaN,
    # not None, so the original `is not None` test kept every row.
    df_target_used = pd.DataFrame()
    for index, row in df_target.iterrows():
        if pd.notna(row['Value']):
            df_target_used.loc[len(df_target_used), 'Logbook_name'] = df_target.loc[index, 'Logbook_name']
    return df_target_used
def extract_logbook_date(df_donnees, version):
    """Extract the month and year cells of the logbook.

    Args:
        df_donnees (df): excel p1
        version (str): logbook layout, "ll_17.6" or "ll_26"

    Returns:
        df: rows 'Month' and 'Year'; 0 when the cell is missing or not numeric.
    """
    df_month = df_donnees.iloc[17, 5]
    if version == "ll_17.6":
        df_year = df_donnees.iloc[17, 11]
    elif version == "ll_26":
        df_year = df_donnees.iloc[17, 8]

    def _to_int(value):
        # BUG FIX: excel cells may hold numpy scalars or numeric strings,
        # which the previous isinstance(..., (int, float)) check silently
        # mapped to 0. Coerce via int() and fall back to 0 on failure.
        try:
            if pd.isna(value):
                return 0
            return int(value)
        except (TypeError, ValueError):
            return 0

    date = {'Logbook_name': ['Month', 'Year'],
            'Value': [_to_int(df_month), _to_int(df_year)]}
    df_date = pd.DataFrame(date)
    df_date['Logbook_name'] = common_functions.remove_spec_char_from_list(df_date['Logbook_name'])
    return df_date
def extract_bait_v17(df_donnees):
    """Extract the bait tick boxes (logbook v17.6, 2024).

    Args:
        df_donnees (df): excel p1

    Returns:
        df: one 'Logbook_name' row per ticked bait type (empty when none).
    """
    # Fixed cell positions of the five tick boxes on row 20.
    df_squid = df_donnees.iloc[19, 16]
    df_sardine = df_donnees.iloc[19, 20]
    df_mackerel = df_donnees.iloc[19, 24]
    df_muroaji = df_donnees.iloc[19, 28]
    df_other = df_donnees.iloc[19, 32]

    bait = {'Logbook_name': ['Squid', 'Sardine', 'Mackerel', 'Muroaji', 'Other'],
            'Value': [df_squid, df_sardine, df_mackerel, df_muroaji, df_other]}
    df_bait = pd.DataFrame(bait)

    # Keep only the ticked boxes. BUG FIX: empty cells come through as NaN
    # (not None); the original `is not None` test treated them all as ticked.
    df_bait_used = pd.DataFrame()
    for index, row in df_bait.iterrows():
        if pd.notna(row['Value']):
            df_bait_used.loc[len(df_bait_used), 'Logbook_name'] = df_bait.loc[index, 'Logbook_name']
    return df_bait_used
def extract_bait_v26(df_donnees):
    """Extract the daily bait column (logbook v26).

    Args:
        df_donnees (df): excel p1

    Returns:
        df: single 'Bait' column, one row per day.
    """
    def _clean(cell):
        # Only non-blank strings are cleaned; everything else passes through.
        if isinstance(cell, str) and cell.strip() != "":
            return "".join(common_functions.remove_spec_char_from_list(cell.strip()))
        return cell

    baits = df_donnees.iloc[23:54, 40:41].copy()
    baits.columns = ['Bait']
    baits['Bait'] = baits['Bait'].apply(_clean)
    return baits
def extract_positions(df_donnees, version):
    """Extract the daily position cells and convert them to decimal degrees.

    Args:
        df_donnees (df): excel p1
        version (str): logbook layout, "ll_17.6" or "ll_26"

    Returns:
        df: 'Latitude' and 'Longitude' columns, rows without a full
        position dropped.
    """
    # BUG FIX: work on a copy — assigning new columns to a raw iloc slice of
    # the caller's frame raises SettingWithCopyWarning and is silently lost
    # under pandas copy-on-write.
    if version == "ll_17.6":
        data = df_donnees.iloc[24:55, :7].copy()
    elif version == "ll_26":
        data = df_donnees.iloc[23:54, :7].copy()

    colnames = ['Day', 'Latitude_Degrees', 'Latitude_Minutes', 'Latitude_Direction',
                'Longitude_Degrees', 'Longitude_Minutes', 'Longitude_Direction']
    data.columns = colnames

    # Convert degree/minute/direction triplets to signed decimal degrees.
    data['Latitude'] = common_functions.dms_to_decimal(data['Latitude_Degrees'], data['Latitude_Minutes'], data['Latitude_Direction'])
    data['Longitude'] = common_functions.dms_to_decimal(data['Longitude_Degrees'], data['Longitude_Minutes'], data['Longitude_Direction'])

    # Keep only complete positions and the two columns of interest.
    data = data.dropna(subset=['Latitude', 'Longitude'])
    df_position = data[['Latitude', 'Longitude']].copy()
    df_position.reset_index(drop=True, inplace=True)
    return df_position
def get_vessel_activity_topiaid_v17(startTimeStamp, allData):
    """Map a start-time cell to a VesselActivity referential entry (v17.6).

    A time-like value ("HH:MM" or 4 digits) means a fishing operation;
    otherwise the cell text is matched against cruise/port keywords.

    Args:
        startTimeStamp: time cell content (time, text, or None)
        allData (dict): referential data (allData["VesselActivity"]["longline"])

    Returns:
        tuple: (topiaId, label1) of the matched activity, or None when the
        cell is empty or the code is missing from the referential.
    """
    # BUG FIX: the None check used to sit AFTER calls to re.findall /
    # .lower(), which raise TypeError on None, so it was unreachable.
    if startTimeStamp is None:
        return None
    # Coerce to str once so time/number cells don't crash re.findall.
    text = str(startTimeStamp)

    if ":" in text:
        code = "FO"
    elif re.findall(r"[0-9]{4}", text):
        code = "FO"
    elif re.findall("cruis", text.lower()):
        code = "CRUISE"
    elif re.findall("crus", text.lower()):
        # Tolerated misspelling of "cruise" seen in real logbooks.
        code = "CRUISE"
    elif re.findall("port", text.lower()):
        code = "PORT"
    else:
        code = "OTH"

    vessel_activities = allData["VesselActivity"]["longline"]
    for vessel_activity in vessel_activities:
        if vessel_activity.get("code") == code:
            return vessel_activity["topiaId"], vessel_activity["label1"]
    return None
def get_vessel_activity_topiaid_v26(df_donnees, allData):
    """Map the free-text activity cell (logbook v26) to a VesselActivity.

    Args:
        df_donnees: activity cell content (text or NaN/None when not filled)
        allData (dict): referential data (allData["VesselActivity"]["longline"])

    Returns:
        tuple: (topiaId, label1) of the matched referential entry.
    """
    if pd.isna(df_donnees):
        # Cell not filled.
        target_id = "fr.ird.referential.ll.common.VesselActivity#666#07"
    else:
        text = df_donnees.strip().lower()
        # Ordered (substring, topiaId) rules; first hit wins.
        rules = [
            ("cru", "fr.ird.referential.ll.common.VesselActivity#666#01"),
            ("fis", "fr.ird.referential.ll.common.VesselActivity#1239832686138#0.1"),
            ("out", "fr.ird.referential.ll.common.VesselActivity#666#04"),
            ("ose", "fr.ird.referential.ll.common.VesselActivity#666#04"),
            ("por", "fr.ird.referential.ll.common.VesselActivity#666#03"),
            ("tra", "fr.ird.referential.ll.common.VesselActivity#666#06"),
        ]
        target_id = next(
            (tid for token, tid in rules if token in text),
            # Fallback: OTHER
            "fr.ird.referential.ll.common.VesselActivity#1239832686138#0.2",
        )

    # Look the id up in the referential.
    vessel_activity_match = next(
        (v for v in allData["VesselActivity"]["longline"]
         if v["topiaId"] == target_id),
        None,
    )
    return vessel_activity_match["topiaId"], vessel_activity_match["label1"]
def extract_time(df_donnees, allData, version):
    """Extract the set-start time of each day and derive the vessel activity.

    Args:
        df_donnees (df): excel p1
        allData (json): referential data
        version (str): logbook layout, "ll_17.6" or "ll_26"

    Returns:
        df: columns Day / Time / VesselActivity / VesselActivity_topiaId.
        Time holds a time value, or free text when the vessel was moving.
    """
    # The v26 layout inserts a dedicated activity column before the time one.
    if version == "ll_17.6":
        day = df_donnees.iloc[24:55, 0]
        df_time = df_donnees.iloc[24:55, 7:8]
    elif version == "ll_26":
        day = df_donnees.iloc[23:54, 0]
        df_vesselactivity = df_donnees.iloc[23:54, 7:8]
        df_time = df_donnees.iloc[23:54, 8:9]
    colnames = ['Time']
    df_time.columns = colnames
    # Cells are either parsable times or free text (e.g. "in port").
    df_time['Time'] = df_time['Time'].apply(common_functions.convert_to_time_or_text)
    df_time.reset_index(drop=True, inplace=True)
    if version == "ll_17.6":
        # v17: the activity is inferred from the time cell itself.
        vessel_activities = np.empty((len(day), 2), dtype=object)
        for ligne in range(len(day)):
            # NOTE(review): get_vessel_activity_topiaid_v17 returns None for
            # an empty cell or unknown code; subscripting would then raise
            # TypeError — confirm the input always yields a match.
            vessel_activity = get_vessel_activity_topiaid_v17(
                df_time.iloc[ligne]['Time'], allData)
            vessel_activities[ligne, 0] = vessel_activity[1]
            vessel_activities[ligne, 1] = vessel_activity[0]
    elif version == "ll_26":
        # v26: the activity comes from its own dedicated column.
        vessel_activities = np.empty((len(day), 2), dtype=object)
        for ligne in range(len(day)):
            vessel_activity = get_vessel_activity_topiaid_v26(df_vesselactivity.iloc[ligne].iloc[0], allData)
            vessel_activities[ligne, 0] = vessel_activity[1]
            vessel_activities[ligne, 1] = vessel_activity[0]
    # Assemble day, time and the (label, topiaId) pairs into one frame.
    np_time = np.column_stack((day, df_time, vessel_activities))
    df_time = pd.DataFrame(np_time, columns=['Day', 'Time', 'VesselActivity', 'VesselActivity_topiaId'])
    return df_time
def extract_temperature(df_donnees, version):
    """Extract the daily sea-surface temperature column.

    Args:
        df_donnees (df): excel p1
        version (str): logbook layout, "ll_17.6" or "ll_26"

    Returns:
        df: single 'Temperature' column, one row per day.

    Raises:
        ValueError: if `version` is not a known logbook layout.
    """
    # .copy() so renaming/reindexing never touches the caller's frame.
    if version == "ll_17.6":
        df_temp = df_donnees.iloc[24:55, 8:9].copy()
    elif version == "ll_26":
        df_temp = df_donnees.iloc[23:54, 9:10].copy()
    else:
        # Previously an unknown version crashed with UnboundLocalError.
        raise ValueError(f"Unsupported logbook version: {version!r}")

    df_temp.columns = ['Temperature']
    df_temp.reset_index(drop=True, inplace=True)
    return df_temp
def extract_fishing_effort(df_donnees, version):
    """Extract the daily fishing-effort columns (hooks and lightsticks).

    Args:
        df_donnees (df): excel p1
        version (str): logbook layout, "ll_17.6" or "ll_26"

    Returns:
        df: Day, Hooks per basket, Total hooks, Total lightsticks and a
        derived 'Total hooks / Hooks per basket' column (baskets count).
    """
    if version == "ll_17.6":
        df_fishing_effort = df_donnees.iloc[24:55, [0, 9, 10, 11]].copy()
    elif version == "ll_26":
        # NOTE(review): same column indices as ll_17.6, although other v26
        # extractors shift one column right (e.g. extract_temperature reads
        # column 9 for ll_26) — confirm against the v26 sheet template.
        df_fishing_effort = df_donnees.iloc[23:54, [0, 9, 10, 11]].copy()
    df_fishing_effort.columns = ['Day', 'Hooks per basket', 'Total hooks', 'Total lightsticks']
    try:
        # Derived number of baskets = total hooks / hooks per basket.
        df_fishing_effort['Total hooks / Hooks per basket'] = common_functions.convert_to_int(df_fishing_effort['Total hooks']) / common_functions.convert_to_int(df_fishing_effort['Hooks per basket'])
    except TypeError:
        # Non-numeric cells: leave the derived column empty.
        df_fishing_effort['Total hooks / Hooks per basket'] = None
    df_fishing_effort.reset_index(drop=True, inplace=True)
    return df_fishing_effort
def extract_fish_p1_v17(df_donnees):
    """Extract retained catches (page 1, logbook v17.6).

    Args:
        df_donnees (df): excel p1

    Returns:
        df: one 'No. RET x' / 'Kg RET x' column pair per species, empty
        cells replaced by 0, one row per day.
    """
    # Each species occupies a (count, weight) column pair, in sheet order.
    species = ['SBF', 'ALB', 'BET', 'YFT', 'SWO', 'MLS',
               'BUM', 'BLM', 'SFA', 'SSP', 'OIL', 'MZZ']
    headers = []
    for code in species:
        headers.append('No. RET ' + code)
        headers.append('Kg RET ' + code)

    catches = df_donnees.iloc[24:55, 12:36]
    catches.columns = headers
    # Normalise: empty cells become 0.
    catches = catches.map(common_functions.zero_if_empty)
    catches.reset_index(drop=True, inplace=True)
    return catches
def extract_bycatch_p2_v17(df_donnees):
    """Extract bycatch columns (page 2, logbook v17.6).

    Args:
        df_donnees (df): excel p2

    Returns:
        df: RET/ESC/DIS columns per species, empty cells replaced by 0,
        one row per day.
    """
    # First group of species carries retained count/kg plus escaped and
    # discarded counts; the second group only escaped/discarded counts.
    full_fate = ['FAL', 'BSH', 'MAK', 'MSK', 'SPN', 'TIG', 'PSK']
    esc_dis_only = ['THR', 'OCS', 'MAM', 'SBD', 'TTX']
    headers = []
    for code in full_fate:
        headers += ['No. RET ' + code, 'Kg RET ' + code,
                    'No. ESC ' + code, 'No. DIS ' + code]
    for code in esc_dis_only:
        headers += ['No. ESC ' + code, 'No. DIS ' + code]

    bycatch = df_donnees.iloc[15:46, 1:39]
    bycatch.columns = headers
    # Normalise: empty cells become 0.
    bycatch = bycatch.map(common_functions.zero_if_empty)
    bycatch.reset_index(drop=True, inplace=True)
    return bycatch
def clean_value(x):
    """Normalise "empty" cell values to None.

    Scalars: None / NaN / '' / 0 map to None, anything else passes through.
    Series: returns None when every element is NaN or every element is 0,
    otherwise the Series itself.
    """
    if x is None:
        return None
    if isinstance(x, pd.Series):
        is_empty_series = x.isna().all() or (x == 0).all()
        return None if is_empty_series else x
    if pd.isna(x) or x == '' or x == 0:
        return None
    return x
def extract_catches_v26(df_donnees, version):
    """Extract retained catches as a list of per-day catch dicts.

    Columns come in (count, kg) pairs; the species code is read from the
    "(XXX)" suffix of the count header and the on-board processing code
    from the two trailing capitals of the kg header.

    Args:
        df_donnees (df): excel p1
        version (str): logbook layout, "ll_17.6" or "ll_26"

    Returns:
        list: one list per day; each entry is a dict with keys
        species / onBoardProcessing / catchFate / discardHealthStatus /
        count / kg. Days with no data yield an empty list.
    """
    if version == "ll_17.6":
        df_fishes = df_donnees.iloc[22:55, 12:36]
    elif version == "ll_26":
        df_fishes = df_donnees.iloc[21:54, 13:39]
    # First extracted row is the header; the next one is a sub-header.
    df_fishes.columns = df_fishes.iloc[0]
    df_fishes = df_fishes.iloc[2:].reset_index(drop=True)

    fishes = []
    cols = list(df_fishes.columns)
    # BUG FIX: loop variable renamed from `_` (it shadowed the gettext alias)
    # and cells are read positionally with .iloc — the original mixed
    # label-based `row[col_no]` with deprecated positional `row[i + 1]`.
    # Leftover debug prints and dead commented-out code removed.
    for _row_idx, row in df_fishes.iterrows():
        fishes_row = []  # catches of ONE day
        for i in range(0, len(cols), 2):
            col_no = cols[i]
            col_kg = cols[i + 1]

            # Species code (SBF, ALB, BET, ...) from the "(XXX)" suffix.
            species_match = re.search(r'\(([^)]+)\)', col_no)
            species_code = species_match.group(1) if species_match else None

            # Processing code (two trailing capitals, e.g. GG, HG).
            process_match = re.search(r'\b([A-Z]{2})\b\s*$', col_kg)
            specie_process = process_match.group(1) if process_match else None

            # Normalise empties (None / NaN / '' / 0) to None.
            count = clean_value(row.iloc[i])
            kg = clean_value(row.iloc[i + 1])

            # Keep the pair only when at least one value is filled.
            if count is None and kg is None:
                continue

            fishes_row.append({
                "species": species_code,
                "onBoardProcessing": specie_process,
                "catchFate": 'RET',
                "discardHealthStatus": None,
                "count": count,
                "kg": kg
            })
        fishes.append(fishes_row)
    return fishes
def extract_bycatch_page2_v26(df_donnees):
    """Extract bycatch data from page 2 of the v26 logbook.

    The sheet holds groups of 4 columns per species: retained count,
    retained kg, discarded-alive count, discarded-dead count.

    Args:
        df_donnees (df): excel p2

    Returns:
        list: one list per day; each entry is a dict with keys
        species / onBoardProcessing / catchFate / discardHealthStatus /
        count / kg.
    """
    df_bycatches = df_donnees.iloc[11:46, 1:41]
    # Rows 0-1 of the extract are the two header rows (species / processing).
    df_bycatches_columns = df_bycatches.iloc[0:2]
    # Data starts 4 rows in (headers plus two decoration rows are skipped).
    df_bycatches = df_bycatches.iloc[4:].reset_index(drop=True)
    bycatches = []
    n_cols = df_bycatches.shape[1]
    for _, row in df_bycatches.iterrows():
        bycatches_row = []
        # One species every 4 columns: RET count, RET kg, DIS A, DIS D.
        for i in range(0, n_cols, 4):
            col_species = df_bycatches_columns.iloc[0, i]
            col_process = df_bycatches_columns.iloc[1, i]
            # Species code (SBF, ALB, BET, ...) from the "(XXX)" suffix.
            species_match = re.search(r'\(([^)]+)\)', col_species)
            species_code = species_match.group(1) if species_match else None
            # Processing code (HG, GG, ...) from the two trailing capitals.
            process_match = re.search(r'\b([A-Z]{2})\b\s*$', col_process)
            specie_process = process_match.group(1) if process_match else None
            count_ret = row.iloc[i]
            kg_ret = row.iloc[i+1]
            count_R_A = row.iloc[i+2]
            count_R_D = row.iloc[i+3]
            # Normalise empty cells (NaN or '') to None.
            count_ret = None if pd.isna(count_ret) or count_ret == '' else count_ret
            kg_ret = None if pd.isna(kg_ret) or kg_ret == '' else kg_ret
            count_R_A = None if pd.isna(count_R_A) or count_R_A == '' else count_R_A
            count_R_D = None if pd.isna(count_R_D) or count_R_D == '' else count_R_D
            # Keep the group only when at least one value is filled.
            if all(v in (None, 0, '') for v in [count_ret, kg_ret, count_R_A, count_R_D]):
                continue
            # ---------- RET ----------
            if count_ret is not None or kg_ret is not None:
                bycatches_row.append({
                    "species": species_code,
                    "onBoardProcessing": specie_process,
                    "catchFate": "RET",
                    "discardHealthStatus": None,
                    "count": count_ret,
                    "kg": kg_ret
                })
            # ---------- DIS / A (discarded alive) ----------
            if count_R_A is not None:
                bycatches_row.append({
                    "species": species_code,
                    "onBoardProcessing": specie_process,
                    "catchFate": "DIS",
                    "discardHealthStatus": "A",
                    "count": count_R_A,
                    "kg": None
                })
            # ---------- DIS / D (discarded dead) ----------
            if count_R_D is not None:
                bycatches_row.append({
                    "species": species_code,
                    "onBoardProcessing": specie_process,
                    "catchFate": "DIS",
                    "discardHealthStatus": "D",
                    "count": count_R_D,
                    "kg": None
                })
        bycatches.append(bycatches_row)
    return bycatches
def extract_bycatch_page3_v26(df_donnees, df_ref):
    """Extract bycatch captures from page 3 of the v26 logbook.

    Args:
        df_donnees (df): sheet page 3
        df_ref (df): sheet page 4 (reference tables)

    Returns:
        list of dict: captures per day row
    """
    # -------------------------
    # Data preparation
    # -------------------------
    df_bycatches = df_donnees.iloc[11:46, 1:27]
    # Rows 0-1 are the two header rows (species / processing).
    df_bycatches_columns = df_bycatches.iloc[0:2]
    # Data starts 4 rows into the extract.
    df_bycatches = df_bycatches.iloc[4:].reset_index(drop=True)
    # Reference table used to map free-text species names to codes.
    df_ref_bycatch = ref_table_bycatch(df_ref)
    if 'CODE 代碼' in df_ref_bycatch.columns and 'NAME_EN 英文名' in df_ref_bycatch.columns:
        df_ref_bycatch = df_ref_bycatch.rename(columns={'CODE 代碼': 'CODE', 'NAME_EN 英文名': 'NAME_EN'})
    else:
        df_ref_bycatch.columns = df_ref_bycatch.columns.str.strip()
    df_ref_bycatch['NAME_EN'] = df_ref_bycatch['NAME_EN'].str.strip()
    bycatches = []
    n_cols = df_bycatches.shape[1]

    def clean(v):
        # Normalise empty cells (NaN, '' or 0) to None.
        return None if pd.isna(v) or v in ('', 0) else v

    for _, row in df_bycatches.iterrows():
        bycatches_row = []
        # -------------------------
        # SKH columns (0-3): RET count/kg + DIS A / DIS D
        # -------------------------
        col_species = df_bycatches_columns.iloc[0, 0]
        col_process = df_bycatches_columns.iloc[1, 0]
        species_match = re.search(r'\(([^)]+)\)', str(col_species))
        # Falls back to the raw header text when no "(XXX)" suffix exists.
        species_code = species_match.group(1) if species_match else col_species
        process_match = re.search(r'\b([A-Z]{2})\b\s*$', str(col_process))
        specie_process = process_match.group(1) if process_match else None
        count_ret = clean(row.iloc[0])
        kg_ret = clean(row.iloc[1])
        count_DIS_A = clean(row.iloc[2])
        count_DIS_D = clean(row.iloc[3])
        if count_ret is not None or kg_ret is not None:
            bycatches_row.append({
                "species": species_code,
                "onBoardProcessing": specie_process,
                "catchFate": "RET",
                "discardHealthStatus": None,
                "count": count_ret,
                "kg": kg_ret
            })
        if count_DIS_A is not None:
            bycatches_row.append({
                "species": species_code,
                "onBoardProcessing": specie_process,
                "catchFate": "DIS",
                "discardHealthStatus": "A",
                "count": count_DIS_A,
                "kg": None
            })
        if count_DIS_D is not None:
            bycatches_row.append({
                "species": species_code,
                "onBoardProcessing": specie_process,
                "catchFate": "DIS",
                "discardHealthStatus": "D",
                "count": count_DIS_D,
                "kg": None
            })
        # -------------------------
        # Columns 4-11: pairs of DIS A / DIS D per headed species
        # -------------------------
        i = 4
        while i < 12:
            col_species = df_bycatches_columns.iloc[0, i]
            col_process = df_bycatches_columns.iloc[1, i]
            species_match = re.search(r'\(([^)]+)\)', str(col_species))
            species_code = species_match.group(1) if species_match else col_species
            process_match = re.search(r'\b([A-Z]{2})\b\s*$', str(col_process))
            specie_process = process_match.group(1) if process_match else None
            v0 = clean(row.iloc[i])
            v1 = clean(row.iloc[i + 1]) if i + 1 < n_cols else None
            if v0 is not None:
                bycatches_row.append({
                    "species": species_code,
                    "onBoardProcessing": specie_process,
                    "catchFate": "DIS",
                    "discardHealthStatus": "A",
                    "count": v0,
                    "kg": None
                })
            if v1 is not None:
                bycatches_row.append({
                    "species": species_code,
                    "onBoardProcessing": specie_process,
                    "catchFate": "DIS",
                    "discardHealthStatus": "D",
                    "count": v1,
                    "kg": None
                })
            i += 2
        # -------------------------
        # Columns 12-23: free-text species name, then DIS A, DIS D
        # -------------------------
        i = 12
        while i < 24:
            specie_name = row.iloc[i]
            count_RA = clean(row.iloc[i + 1]) if i + 1 < n_cols else None
            count_RD = clean(row.iloc[i + 2]) if i + 2 < n_cols else None
            if pd.notna(specie_name) and isinstance(specie_name, str):
                if len(specie_name) >= 4:
                    # Long names are resolved through the reference table.
                    # NOTE(review): .values[0] raises IndexError when the
                    # name is absent from the reference — confirm inputs.
                    species_code = df_ref_bycatch.loc[df_ref_bycatch['NAME_EN'] == specie_name, 'CODE'].values[0]
                else:
                    # Short entries are assumed to already be a code.
                    species_code = specie_name
                if count_RA is not None:
                    bycatches_row.append({
                        "species": species_code,
                        "onBoardProcessing": None,
                        "catchFate": "DIS",
                        "discardHealthStatus": "A",
                        "count": count_RA,
                        "kg": None
                    })
                if count_RD is not None:
                    bycatches_row.append({
                        "species": species_code,
                        "onBoardProcessing": None,
                        "catchFate": "DIS",
                        "discardHealthStatus": "D",
                        "count": count_RD,
                        "kg": None
                    })
            i += 3
        # -------------------------
        # Last columns: species == WHALE SHARK (RHN), DIS A, DIS D
        # -------------------------
        i = 24
        while i < n_cols:
            species_code = "RHN"
            count_RA = clean(row.iloc[i + 1]) if i + 1 < n_cols else None
            count_RD = clean(row.iloc[i + 2]) if i + 2 < n_cols else None
            if count_RA is not None:
                bycatches_row.append({
                    "species": species_code,
                    "onBoardProcessing": None,
                    "catchFate": "DIS",
                    "discardHealthStatus": "A",
                    "count": count_RA,
                    "kg": None
                })
            if count_RD is not None:
                bycatches_row.append({
                    "species": species_code,
                    "onBoardProcessing": None,
                    "catchFate": "DIS",
                    "discardHealthStatus": "D",
                    "count": count_RD,
                    "kg": None
                })
            i += 2
        bycatches.append(bycatches_row)
    return bycatches
def extract_seabirds_ref(df_donnees):
    """Extract the seabird species reference table.

    Args:
        df_donnees (df): excel p4 of the v26 logbook

    Returns:
        df: reference rows, column names taken (and stripped) from the
        first row of the extracted area.
    """
    block = df_donnees.iloc[2:34, 5:8].copy()
    # First row of the block carries the column headers.
    header = block.iloc[0]
    df_seabirds = block.iloc[1:].reset_index(drop=True)
    df_seabirds.columns = [str(c).strip() for c in header]
    return df_seabirds
def extract_marineturtles_ref(df_donnees):
    """Extract the marine-turtle species reference table.

    Args:
        df_donnees (df): excel p4 of the v26 logbook

    Returns:
        df: reference rows, headers taken from the first extracted row.
    """
    block = df_donnees.iloc[43:51, 1:4].copy()
    # Promote the first row to (stripped) column headers.
    block.columns = block.iloc[0]
    block.columns = block.columns.str.strip()
    return block[1:].reset_index(drop=True)
def extract_rays_ref(df_donnees):
    """Extract the ray species reference table.

    Args:
        df_donnees (df): excel p4 of the v26 logbook

    Returns:
        df: reference rows, headers taken from the first extracted row.
    """
    block = df_donnees.iloc[53:70, 1:4].copy()
    # Promote the first row to (stripped) column headers.
    block.columns = block.iloc[0]
    block.columns = block.columns.str.strip()
    return block[1:].reset_index(drop=True)
def extract_cetaceans_ref(df_donnees):
    """Extract the cetacean species reference table.

    Args:
        df_donnees (df): excel p4 of the v26 logbook

    Returns:
        df: reference rows, headers taken from the first extracted row.
    """
    block = df_donnees.iloc[36:82, 5:8].copy()
    # Promote the first row to (stripped) column headers.
    block.columns = block.iloc[0]
    block.columns = block.columns.str.strip()
    return block[1:].reset_index(drop=True)
def extract_baits_ref(df_donnees):
    """Extract the bait reference table.

    Args:
        df_donnees (df): excel p4 of the v26 logbook

    Returns:
        df: reference rows with a normalised 'CODE' column.
    """
    block = df_donnees.iloc[2:7, 9:11].copy()
    # Promote the first row to column headers.
    block.columns = block.iloc[0]
    df_ref_baits = block[1:].reset_index(drop=True)
    # Normalise the bilingual code header; otherwise just strip all headers.
    if 'CODE 代碼' in df_ref_baits.columns:
        df_ref_baits = df_ref_baits.rename(columns={'CODE 代碼': 'CODE'})
    else:
        df_ref_baits.columns = df_ref_baits.columns.str.strip()
    return df_ref_baits
def ref_table_bycatch(df_donnees):
    """Build the combined bycatch species reference table.

    Stacks the cetacean, ray, marine-turtle and seabird reference tables
    extracted from page 4 of the v26 logbook.

    Args:
        df_donnees (df): excel p4 of the v26 logbook

    Returns:
        df: concatenation of the four reference tables.
    """
    partial_tables = [
        extract_cetaceans_ref(df_donnees),
        extract_rays_ref(df_donnees),
        extract_marineturtles_ref(df_donnees),
        extract_seabirds_ref(df_donnees),
    ]
    return pd.concat(partial_tables, ignore_index=True)
def extract_material_ref(df_donnees):
    """Extract the line-material reference table.

    Args:
        df_donnees (df): excel p4 of the v26 logbook

    Returns:
        df: reference rows with normalised 'CODE' plus a 'Code_v26' column
        holding the ObServe-side material code.
    """
    block = df_donnees.iloc[23:28, 9:11].copy()
    # Promote the first row to column headers.
    block.columns = block.iloc[0]
    materials = block[1:].reset_index(drop=True)
    # Normalise the bilingual code header; otherwise just strip all headers.
    if 'CODE 代碼' in materials.columns:
        materials = materials.rename(columns={'CODE 代碼': 'CODE'})
    else:
        materials.columns = materials.columns.str.strip()
    materials['CODE'] = materials['CODE'].str.strip()
    # Sheet codes -> referential codes (unmapped codes become NaN).
    mappingLine_v26 = {
        "N": "MUN",
        "NB": "BRL",
        "NM": "MON",
        "OTH": "UNK"}
    materials["Code_v26"] = materials["CODE"].map(mappingLine_v26)
    return materials