Extract fuel-consumption data for electricity generation

This code takes the monthly operational data reports published by Argentina's wholesale electricity market administrator, Compañía Administradora del Mercado Mayorista Eléctrico (CAMMESA), concatenates them into a single CSV file, and filters out duplicate rows.

Each report contains a monthly summary with fuel-consumption details for electricity generation, broken down by plant, unit, agent, unit type, generation source, technology, and fuel type.

Units: GAS NATURAL [dam3] - FUEL OIL [ton] - GAS OIL [m3] - CARBÓN MINERAL [ton] - BIODIESEL [ton]
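For later reference, the unit of each fuel can be captured in a small lookup table. This dictionary (`FUEL_UNITS` is a hypothetical helper, not used by the cells below) is handy when labelling the `CONSUMO` column after extraction:

```python
# Hypothetical lookup table mapping each fuel, as it appears in the
# COMBUSTIBLE column, to the unit its CONSUMO values are reported in.
FUEL_UNITS = {
    'GAS NATURAL': 'dam3',      # cubic decametres (thousands of m3)
    'FUEL OIL': 'ton',
    'GAS OIL': 'm3',
    'CARBÓN MINERAL': 'ton',
    'BIODIESEL': 'ton',
}

print(FUEL_UNITS['GAS NATURAL'])  # dam3
```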

In [1]:
# Load libraries
import zipfile
import numpy as np
import pandas as pd
from datetime import date, datetime
from dateutil.rrule import rrule, MONTHLY
from os import remove, path
In [2]:
# Define dates
sdate = date(2016, 1, 1)   # start date (year, month, day)
edate = date(2016, 5, 1)   # end date (year, month, day)
In [3]:
# Define extraction parameters
filePath = './CAMMESA'
fileName = 'BASE_INFORME_MENSUAL_'
sheetName = 'COMBUSTIBLES'
headerRow = 22
columns = 'A:K'
columnNames = ['AÑO', 'MES', 'MAQUINA', 'CENTRAL', 'AGENTE', 'AGENTE_DESCRIPCION',
               'TIPO_MAQUINA', 'FUENTE_GENERACION', 'TECNOLOGIA', 'COMBUSTIBLE', 'CONSUMO']

outFile = 'Combustibles.csv'  # output file name
In [4]:
# Date helpers
def pad_date(month_num):
    """Zero-pad a month number to two digits."""
    return str(month_num).zfill(2)

def month_iter(start_month, start_year, end_month, end_year):
    """Yield ('MM', 'YYYY') tuples for every month in the range, inclusive."""
    start = datetime(start_year, start_month, 1)
    end = datetime(end_year, end_month, 1)

    return ((pad_date(d.month), str(d.year)) for d in rrule(MONTHLY, dtstart=start, until=end))
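A quick self-contained check of what the month iterator produces (restated here with an equivalent zero-padding helper so the snippet runs on its own):

```python
from datetime import datetime
from dateutil.rrule import rrule, MONTHLY

def pad_date(month_num):
    """Zero-pad a month number to two digits."""
    return str(month_num).zfill(2)

def month_iter(start_month, start_year, end_month, end_year):
    """Yield ('MM', 'YYYY') tuples for every month in the range, inclusive."""
    start = datetime(start_year, start_month, 1)
    end = datetime(end_year, end_month, 1)
    return ((pad_date(d.month), str(d.year))
            for d in rrule(MONTHLY, dtstart=start, until=end))

months = list(month_iter(1, 2016, 3, 2016))
print(months)  # [('01', '2016'), ('02', '2016'), ('03', '2016')]
```

Note that `rrule` includes both endpoints, so the range January–May covers five reports.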
In [5]:
# Unzip data
for (month, year) in month_iter(sdate.month, sdate.year, edate.month, edate.year):
    inFile = f'{filePath}/{fileName}{year}-{month}.zip'

    if path.isfile(inFile):
        print('Extracting ' + inFile)
        with zipfile.ZipFile(inFile) as zf:
            zf.extractall(filePath)
    else:
        print('ERROR: ' + inFile + ' does not exist')
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-01.zip does not exist
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-02.zip does not exist
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-03.zip does not exist
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-04.zip does not exist
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-05.zip does not exist
In [6]:
# Delete the zip files
for (month, year) in month_iter(sdate.month, sdate.year, edate.month, edate.year):
    inFile = f'{filePath}/{fileName}{year}-{month}.zip'

    if path.isfile(inFile):
        print('Deleting ' + inFile)
        remove(inFile)
    else:
        print('ERROR: ' + inFile + ' does not exist')
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-01.zip does not exist
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-02.zip does not exist
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-03.zip does not exist
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-04.zip does not exist
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-05.zip does not exist
In [7]:
# Combine the data into a single DataFrame
out = pd.DataFrame()

for (month, year) in month_iter(sdate.month, sdate.year, edate.month, edate.year):
    inFile = f'{filePath}/{fileName}{year}-{month}.xlsx'

    if path.isfile(inFile):
        print('Concatenating ' + inFile)
        df = pd.read_excel(inFile, sheet_name=sheetName, header=headerRow, usecols=columns)
        df.columns = columnNames
        out = pd.concat([out, df])
    else:
        print('ERROR: ' + inFile + ' does not exist')
Concatenating ./CAMMESA/BASE_INFORME_MENSUAL_2016-01.xlsx
Concatenating ./CAMMESA/BASE_INFORME_MENSUAL_2016-02.xlsx
Concatenating ./CAMMESA/BASE_INFORME_MENSUAL_2016-03.xlsx
Concatenating ./CAMMESA/BASE_INFORME_MENSUAL_2016-04.xlsx
Concatenating ./CAMMESA/BASE_INFORME_MENSUAL_2016-05.xlsx
In [8]:
out.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 52734 entries, 0 to 11160
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   AÑO                 52734 non-null  int64         
 1   MES                 52734 non-null  datetime64[ns]
 2   MAQUINA             52734 non-null  object        
 3   CENTRAL             52734 non-null  object        
 4   AGENTE              52734 non-null  object        
 5   AGENTE_DESCRIPCION  52734 non-null  object        
 6   TIPO_MAQUINA        52734 non-null  object        
 7   FUENTE_GENERACION   52734 non-null  object        
 8   TECNOLOGIA          52734 non-null  object        
 9   COMBUSTIBLE         52734 non-null  object        
 10  CONSUMO             52734 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(8)
memory usage: 4.8+ MB
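The `Int64Index: 52734 entries, 0 to 11160` above is expected: `pd.concat` keeps each month's original row labels, so the combined frame carries duplicated index values (passing `ignore_index=True` would produce a clean 0..N index instead). A minimal illustration of the difference:

```python
import pandas as pd

# Two frames with the same default index, standing in for two monthly reports
a = pd.DataFrame({'x': [1, 2]})   # index 0, 1
b = pd.DataFrame({'x': [3, 4]})   # index 0, 1 again

stacked = pd.concat([a, b])
print(list(stacked.index))        # [0, 1, 0, 1] – labels repeat

flat = pd.concat([a, b], ignore_index=True)
print(list(flat.index))           # [0, 1, 2, 3] – fresh RangeIndex
```

Here the duplicated labels are harmless because the index is rebuilt after deduplication in the next step.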
In [9]:
# Drop duplicate rows and rebuild a clean index
df = out.drop_duplicates(subset=columnNames).reset_index(drop=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23030 entries, 0 to 23029
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   AÑO                 23030 non-null  int64         
 1   MES                 23030 non-null  datetime64[ns]
 2   MAQUINA             23030 non-null  object        
 3   CENTRAL             23030 non-null  object        
 4   AGENTE              23030 non-null  object        
 5   AGENTE_DESCRIPCION  23030 non-null  object        
 6   TIPO_MAQUINA        23030 non-null  object        
 7   FUENTE_GENERACION   23030 non-null  object        
 8   TECNOLOGIA          23030 non-null  object        
 9   COMBUSTIBLE         23030 non-null  object        
 10  CONSUMO             23030 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(8)
memory usage: 1.9+ MB
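The drop from 52734 to 23030 rows comes from `drop_duplicates` keeping only the first occurrence of each fully identical row across all eleven columns. A small self-contained sketch of that behaviour:

```python
import pandas as pd

# Three rows; the first two are identical across every column
df = pd.DataFrame({'AÑO': [2016, 2016, 2016],
                   'MES': [1, 1, 2],
                   'CONSUMO': [5.0, 5.0, 7.0]})

deduped = df.drop_duplicates().reset_index(drop=True)
print(len(deduped))  # 2 – the repeated (2016, 1, 5.0) row is kept only once
```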
In [10]:
# Export the table
df.to_csv(outFile, index=False)