Nombres premiers

def crible_eratosthene(n):
    """Génère une liste de booléens où True indique qu'un nombre est premier."""

    est_premier = [True] * (n + 1)
    est_premier[0] = est_premier[1] = False  # 0 et 1 ne sont pas premiers

    for i in range(2, int(n**0.5) + 1):
        if est_premier[i]:
            for multiple in range(i * i, n + 1, i):
                est_premier[multiple] = False

    return est_premier
l = crible_eratosthene(100000)

cpt = 0
for i in l[10000:]:
    if i:
        cpt += 1

print(cpt)
l[8363]

1 Fichier des communes

import pandas as pd
import duckdb

duckdb.sql("""
CREATE OR REPLACE VIEW commune AS
FROM 'https://www.insee.fr/fr/statistiques/fichier/7766585/v_commune_2024.csv';
""")
duckdb.sql("""
SELECT *
  FROM commune
""").to_df()
duckdb.sql("""
SELECT reg
  FROM commune
 WHERE NCC = 'MAMOUDZOU';
""").to_df()
duckdb.sql("""
SELECT COUNT(1)
  FROM commune
 WHERE dep = '973';
""").to_df()
duckdb.sql("""
SELECT COUNT(1)
  FROM commune
 WHERE dep IN ('02','60','80')
   AND NCC LIKE '%L%L%L%L%';
""").to_df()
duckdb.sql("""
SELECT dep,
       COUNT(1) AS nb_communes,
       RANK() OVER (ORDER BY COUNT(1) DESC) AS rang
  FROM commune
 WHERE dep IS NOT NULL
 GROUP BY dep
HAVING nb_communes >= 500
 ORDER BY 2 DESC;
""").to_df()
duckdb.sql("""
SELECT dep
  FROM commune
 WHERE LENGTH(ncc) = (SELECT max(LENGTH(ncc))
                        FROM commune);
""").to_df()

2 Polars

import os
import polars as pl

s3_endpoint = f'https://{os.environ["AWS_S3_ENDPOINT"]}'
s3_access_key = os.environ["AWS_ACCESS_KEY_ID"]
s3_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
s3_session_token = os.environ["AWS_SESSION_TOKEN"]
s3_region = os.environ["AWS_DEFAULT_REGION"]

s3_username = os.environ["VAULT_TOP_DIR"] # un peu bancal pour avoir le username
s3_file_path = f"s3://{s3_username}/ENSAI/SQL/tp3-longueur-fil.parquet"

storage_options = {
    "aws_endpoint":  s3_endpoint,
    "aws_access_key_id": s3_access_key,
    "aws_secret_access_key": s3_secret_access_key,
    "aws_token": s3_session_token,
    "aws_region": s3_region
  }

file = pl.scan_parquet(source=s3_file_path, 
                     storage_options=storage_options)
df = file.collect()
print(df)
df.filter(pl.col("vetement") == "gilet")

3 Diffusion

fichier_prenoms = pl.scan_parquet(source="s3://ldeneuville-ensai/diffusion/ENSAI/SQL-TP/prenoms-nat2022.parquet", 
                               storage_options=storage_options)
top10f2021 = fichier_prenoms\
    .filter((pl.col("sexe") == 2) & 
            (pl.col("annais") == "2021") & 
            (pl.col("preusuel") != "_PRENOMS_RARES"))\
    .group_by("preusuel")\
    .agg(pl.col("nombre").sum().alias("nombre_total"))\
    .sort("nombre_total", descending=True)\
    .limit(10)\
    .collect()
top10f2021
top10f2021_sql = fichier_prenoms\
    .sql("""
      SELECT preusuel, 
             SUM(nombre) AS nombre_total
        FROM self
       WHERE preusuel <> '_PRENOMS_RARES'
         AND annais = '2021'
         AND sexe = 2
       GROUP BY preusuel
       ORDER BY nombre_total DESC
       LIMIT 10
    """)\
    .collect()

top10f2021_sql
prenom = "Katia"
annee_debut, annee_fin = 1960, 1980

df_prenom_annee = fichier_prenoms\
    .filter((pl.col("preusuel") == prenom.upper()) & 
            (pl.col("annais").str.to_integer(strict=False).is_not_null()) &  # Convertir et exclure les nulls
            (pl.col("annais").str.to_integer(strict=False) >= annee_debut) & 
            (pl.col("annais").str.to_integer(strict=False) <= annee_fin))\
    .group_by("annais")\
    .agg(pl.col("nombre").sum().alias("nombre_total"))\
    .sort("annais")\
    .collect()

print(df_prenom_annee)
fichier_prenoms.sql(f"""
SELECT annais, 
       SUM(nombre) AS nombre_total
  FROM self
 WHERE preusuel = 'KATIA'
   AND annais != 'XXXX' 
   AND annais BETWEEN '{annee_debut}' AND '{annee_fin}'
 GROUP BY annais
 ORDER BY annais;
 """).collect()
import matplotlib.pyplot as plt
import seaborn as sns

# Set seaborn style for a cleaner, more appealing look
sns.set_theme(style="whitegrid")

# Create the plot
fig, ax = plt.subplots(figsize=(10, 6))

# Data
years = df_prenom_annee["annais"].to_list()         # Extracts years as a list
counts = df_prenom_annee["nombre_total"].to_list()  # Extracts counts as a list

# Bar plot with custom colors and transparency for better visuals
ax.bar(years, 
       counts, 
       color="#4C72B0", 
       edgecolor="black", 
       alpha=0.8)

# Set labels and title with padding for readability
ax.set_xlabel("Année", fontsize=12, labelpad=10)
ax.set_ylabel("Nombre d'occurences", fontsize=12, labelpad=10)
ax.set_title(f"Occurrences du prénom {prenom} par année ({annee_debut}-{annee_fin})", fontsize=14, pad=15)

# Rotate x-axis labels, improve spacing, and format grid
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()  # Adjust layout for tight fit

# Display the plot
plt.show()

4 Le prénom

prenoms_feminins = fichier_prenoms.sql(f"""
SELECT preusuel, 
       SUM(nombre) AS nombre_total
  FROM self
 WHERE preusuel != '_PRENOMS_RARES'
   AND annais != 'XXXX' 
   AND sexe = 2
   AND annais BETWEEN '1970' AND '2000'
 GROUP BY preusuel
 ORDER BY nombre_total DESC,
          preusuel DESC;
 """).collect()
(8363 - 3) / 5
prenoms_feminins.row(1672)
prenoms_feminins.with_row_index()[1670:1680]

5 Export

import s3fs

fs = s3fs.S3FileSystem(
    client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},
    key = os.environ["AWS_ACCESS_KEY_ID"], 
    secret = os.environ["AWS_SECRET_ACCESS_KEY"], 
    token = os.environ["AWS_SESSION_TOKEN"])

destination = f"s3://{s3_username}/ENSAI/SQL/output.csv"

with fs.open(destination, mode='wb') as f:
    top10f2021.write_csv(f)