Extraer datos de balance de generación eléctrica

La ejecución de este código utiliza los datos de los informes mensuales de datos operativos eléctricos de la Compañía Administradora del Mercado Mayorista Eléctrico (CAMMESA) de Argentina, y los agrupa en un único archivo CSV y filtra duplicados.

Dentro de los mismos consta un resumen anual del balance (MWh) entre oferta y demanda, por tipo de fuente.

In [1]:
# Cargar librerías
import zipfile
import numpy as np
import pandas as pd
from datetime import date, datetime
from dateutil.rrule import rrule, MONTHLY
from os import remove, path
In [2]:
# Definir fechas
sdate = date(2016, 1, 1)   # fecha de inicio (año, mes, día)
edate = date(2016, 5, 1)   # fecha de fin (año, mes, día)
In [3]:
# Definir valores a extraer
filePath = './CAMMESA'
fileName = 'BASE_INFORME_MENSUAL_'
sheetName = 'BALANCE'
headerRow = 23 #should be one less than the row in excel
columns = 'A:E'
columnNames = ['AÑO', 'MES', 'BALANCE', 'TIPO', 'ENERGIA']

outFile = 'Balance.csv' #nombre del archivo de destino
In [4]:
#Funciones de fecha
def pad_date(month_num):
    month = str(month_num)
    if len(month) == 1:
        month = '0' + month
    return month

def month_iter(start_month, start_year, end_month, end_year):
    start = datetime(start_year, start_month, 1)
    end = datetime(end_year, end_month, 1)

    return ((pad_date(d.month), pad_date(d.year)) for d in rrule(MONTHLY, dtstart=start, until=end))
In [5]:
# Descomprimir datos
for (month, year) in month_iter(sdate.month, sdate.year, edate.month, edate.year):
    nameComponents = [year, month]  
    inFile = filePath + '/' + fileName + '-'.join(nameComponents) + '.zip'

    if path.isfile(inFile):
        print('Extracting ' + inFile)
        with zipfile.ZipFile(inFile) as zf:
            zf.extractall(filePath)
    else:
        print('ERROR: ' + inFile + " does not exist")
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-01.zip does not exist
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-02.zip does not exist
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-03.zip does not exist
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-04.zip does not exist
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-05.zip does not exist
In [6]:
# Borrar archivos zip
for (month, year) in month_iter(sdate.month, sdate.year, edate.month, edate.year):
    nameComponents = [year, month]  
    inFile = filePath + '/' + fileName + '-'.join(nameComponents) + '.zip'
    
    if path.isfile(inFile):
        print('Deleting ' + inFile)
        remove(inFile)
    else:
        print('ERROR: ' + inFile + " does not exist")
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-01.zip does not exist
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-02.zip does not exist
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-03.zip does not exist
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-04.zip does not exist
ERROR: ./CAMMESA/BASE_INFORME_MENSUAL_2016-05.zip does not exist
In [7]:
# Unificar datos en un DataFrame
out = pd.DataFrame()

for (month,year) in month_iter(sdate.month, sdate.year, edate.month, edate.year):
    nameComponents = [year, month]  
    inFile = filePath + '/' + fileName + '-'.join(nameComponents) + '.xlsx'
    
    if path.isfile(inFile):
        print('Concatenating ' + inFile)
        df = pd.read_excel(inFile, sheet_name = sheetName, header = headerRow, usecols = columns)
        df.columns = columnNames
        out = pd.concat([out, df])
    else:
        print('ERROR: ' + inFile + " does not exist")
Concatenating ./CAMMESA/BASE_INFORME_MENSUAL_2016-01.xlsx
Concatenating ./CAMMESA/BASE_INFORME_MENSUAL_2016-02.xlsx
Concatenating ./CAMMESA/BASE_INFORME_MENSUAL_2016-03.xlsx
Concatenating ./CAMMESA/BASE_INFORME_MENSUAL_2016-04.xlsx
Concatenating ./CAMMESA/BASE_INFORME_MENSUAL_2016-05.xlsx
In [8]:
out.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1964 entries, 0 to 409
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   AÑO      1964 non-null   int64         
 1   MES      1964 non-null   datetime64[ns]
 2   BALANCE  1964 non-null   object        
 3   TIPO     1964 non-null   object        
 4   ENERGIA  1964 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 92.1+ KB
In [9]:
#Drop duplicated rows
df = out.drop_duplicates(subset = columnNames, inplace = False)
df.reset_index(drop= True, inplace = True)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 553 entries, 0 to 552
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   AÑO      553 non-null    int64         
 1   MES      553 non-null    datetime64[ns]
 2   BALANCE  553 non-null    object        
 3   TIPO     553 non-null    object        
 4   ENERGIA  553 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 21.7+ KB
In [10]:
#Exporta la tabla
df.to_csv(outFile, index = False)