def crible_eratosthene(n):
"""Génère une liste de booléens où True indique qu'un nombre est premier."""
= [True] * (n + 1)
est_premier 0] = est_premier[1] = False # 0 et 1 ne sont pas premiers
est_premier[
for i in range(2, int(n**0.5) + 1):
if est_premier[i]:
for multiple in range(i * i, n + 1, i):
= False
est_premier[multiple]
return est_premier
Nombres premiers
= crible_eratosthene(100000)
l
= 0
cpt for i in l[10000:]:
if i:
+= 1
cpt
print(cpt)
8363] l[
1 Fichier des communes
import pandas as pd
import duckdb
"""
duckdb.sql(CREATE OR REPLACE VIEW commune AS
FROM 'https://www.insee.fr/fr/statistiques/fichier/7766585/v_commune_2024.csv';
""")
"""
duckdb.sql(SELECT *
FROM commune
""").to_df()
"""
duckdb.sql(SELECT reg
FROM commune
WHERE NCC = 'MAMOUDZOU';
""").to_df()
"""
duckdb.sql(SELECT COUNT(1)
FROM commune
WHERE dep = '973';
""").to_df()
"""
duckdb.sql(SELECT COUNT(1)
FROM commune
WHERE dep IN ('02','60','80')
AND NCC LIKE '%L%L%L%L%';
""").to_df()
"""
duckdb.sql(SELECT dep,
COUNT(1) AS nb_communes,
RANK() OVER (ORDER BY COUNT(1) DESC) AS rang
FROM commune
WHERE dep IS NOT NULL
GROUP BY dep
HAVING nb_communes >= 500
ORDER BY 2 DESC;
""").to_df()
"""
duckdb.sql(SELECT dep
FROM commune
WHERE LENGTH(ncc) = (SELECT max(LENGTH(ncc))
FROM commune);
""").to_df()
2 Polars
import os
import polars as pl
= f'https://{os.environ["AWS_S3_ENDPOINT"]}'
s3_endpoint = os.environ["AWS_ACCESS_KEY_ID"]
s3_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
s3_secret_access_key = os.environ["AWS_SESSION_TOKEN"]
s3_session_token = os.environ["AWS_DEFAULT_REGION"]
s3_region
= os.environ["VAULT_TOP_DIR"] # un peu bancal pour avoir le username s3_username
= f"s3://{s3_username}/ENSAI/SQL/tp3-longueur-fil.parquet"
s3_file_path
= {
storage_options "aws_endpoint": s3_endpoint,
"aws_access_key_id": s3_access_key,
"aws_secret_access_key": s3_secret_access_key,
"aws_token": s3_session_token,
"aws_region": s3_region
}
file = pl.scan_parquet(source=s3_file_path,
=storage_options) storage_options
= file.collect() df
print(df)
filter(pl.col("vetement") == "gilet") df.
3 Diffusion
= pl.scan_parquet(source="s3://ldeneuville-ensai/diffusion/ENSAI/SQL-TP/prenoms-nat2022.parquet",
fichier_prenoms =storage_options) storage_options
= fichier_prenoms\
top10f2021 filter((pl.col("sexe") == 2) &
."annais") == "2021") &
(pl.col("preusuel") != "_PRENOMS_RARES"))\
(pl.col("preusuel")\
.group_by("nombre").sum().alias("nombre_total"))\
.agg(pl.col("nombre_total", descending=True)\
.sort(10)\
.limit( .collect()
top10f2021
= fichier_prenoms\
top10f2021_sql """
.sql( SELECT preusuel,
SUM(nombre) AS nombre_total
FROM self
WHERE preusuel <> '_PRENOMS_RARES'
AND annais = '2021'
AND sexe = 2
GROUP BY preusuel
ORDER BY nombre_total DESC
LIMIT 10
""")\
.collect()
top10f2021_sql
= "Katia"
prenom = 1960, 1980
annee_debut, annee_fin
= fichier_prenoms\
df_prenom_annee filter((pl.col("preusuel") == prenom.upper()) &
."annais").str.to_integer(strict=False).is_not_null()) & # Convertir et exclure les nulls
(pl.col("annais").str.to_integer(strict=False) >= annee_debut) &
(pl.col("annais").str.to_integer(strict=False) <= annee_fin))\
(pl.col("annais")\
.group_by("nombre").sum().alias("nombre_total"))\
.agg(pl.col("annais")\
.sort(
.collect()
print(df_prenom_annee)
f"""
fichier_prenoms.sql(SELECT annais,
SUM(nombre) AS nombre_total
FROM self
WHERE preusuel = 'KATIA'
AND annais != 'XXXX'
AND annais BETWEEN '{annee_debut}' AND '{annee_fin}'
GROUP BY annais
ORDER BY annais;
""").collect()
import matplotlib.pyplot as plt
import seaborn as sns
# Set seaborn style for a cleaner, more appealing look
="whitegrid")
sns.set_theme(style
# Create the plot
= plt.subplots(figsize=(10, 6))
fig, ax
# Data
= df_prenom_annee["annais"].to_list() # Extracts years as a list
years = df_prenom_annee["nombre_total"].to_list() # Extracts counts as a list
counts
# Bar plot with custom colors and transparency for better visuals
ax.bar(years,
counts, ="#4C72B0",
color="black",
edgecolor=0.8)
alpha
# Set labels and title with padding for readability
"Année", fontsize=12, labelpad=10)
ax.set_xlabel("Nombre d'occurences", fontsize=12, labelpad=10)
ax.set_ylabel(f"Occurrences du prénom {prenom} par année ({annee_debut}-{annee_fin})", fontsize=14, pad=15)
ax.set_title(
# Rotate x-axis labels, improve spacing, and format grid
='x', rotation=45)
ax.tick_params(axis# Adjust layout for tight fit
plt.tight_layout()
# Display the plot
plt.show()
4 Le prénom
= fichier_prenoms.sql(f"""
prenoms_feminins SELECT preusuel,
SUM(nombre) AS nombre_total
FROM self
WHERE preusuel != '_PRENOMS_RARES'
AND annais != 'XXXX'
AND sexe = 2
AND annais BETWEEN '1970' AND '2000'
GROUP BY preusuel
ORDER BY nombre_total DESC,
preusuel DESC;
""").collect()
8363 - 3) / 5 (
1672) prenoms_feminins.row(
1670:1680] prenoms_feminins.with_row_index()[
5 Export
import s3fs
= s3fs.S3FileSystem(
fs ={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},
client_kwargs= os.environ["AWS_ACCESS_KEY_ID"],
key = os.environ["AWS_SECRET_ACCESS_KEY"],
secret = os.environ["AWS_SESSION_TOKEN"])
token
= f"s3://{s3_username}/ENSAI/SQL/output.csv"
destination
with fs.open(destination, mode='wb') as f:
top10f2021.write_csv(f)