Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision

Target

Select target project
  • dinum/noms-de-domaine-organismes-secteur-public
  • mdk/noms-de-domaine-organismes-secteur-public
  • gmonserand/noms-de-domaine-organismes-secteur-public
  • sneko/noms-de-domaine-organismes-secteur-public
4 results
Select Git revision
Show changes
Commits on Source (451)
......@@ -29,8 +29,9 @@ check:
rules:
- if: $CI_PIPELINE_SOURCE != "schedule"
script:
- python3 -m pip install validators
- python3 scripts/check.py
- python3 -m venv .venv
- .venv/bin/python -m pip install validators
- .venv/bin/python scripts/check.py
refresh:
# This pipeline is triggered by a schedule job at:
......@@ -42,40 +43,35 @@ refresh:
# - CI_PIPELINE_SOURCE to schedule
# - SSH_PRIVATE_KEY to an accepted private key
stage: refresh
# We're cloning the repository ourselves for multiple reasons:
# - To avoid permission issues (gitlab would clone as root)
# - To be on a branch (gitlab would land us in detached head)
# - To simplify configuration (pushurl would be HTTP, we need SSH)
variables:
GIT_STRATEGY: none
rules:
- if: $CI_PIPELINE_SOURCE == "schedule"
before_script:
# Git refuses to read `.git/` owned by other users unless explicitly
# marked as safe, that's the case here:
# - GitLab clones the repository as `root`.
# - The Docker image we use logs us in as the user `runner`.
# See CVE-2022-24765 for more information.
- git config --global --add safe.directory "$(pwd)"
# If we need to commit, we'll need a username and an email:
- git config --global user.name "${GITLAB_USER_NAME}"
- git config --global user.email "${GITLAB_USER_EMAIL}"
# Tell git we're pushing over SSH, not over HTTP.
- git remote set-url origin --push
"$(git remote get-url origin | sed 's~https://.*@\([^/]\+\)/~git@\1:~g')"
- git checkout "$CI_COMMIT_REF_NAME"
- git reset --hard "$CI_COMMIT_SHA"
# Setup of private ssh key in case we need to push:
- mkdir -p ~/.ssh/
- chmod 700 ~/.ssh
- echo "$SSH_PRIVATE_KEY" | tr -d '\r' > ~/.ssh/id_ed25519
- chmod 600 ~/.ssh/id_ed25519
# Just to see the user, email, and pushurl used:
- cat ~/.gitconfig
- cat .git/config
# Cloning the repo ourselves
- git clone --depth 1 git@gitlab.adullact.net:dinum/noms-de-domaine-organismes-secteur-public.git
- cd noms-de-domaine-organismes-secteur-public
# And while we're at it, show some debug information:
- python3 --version
script:
- python3 -m pip install aiohttp tqdm
- python3 -m venv .venv
- .venv/bin/python -m pip install aiohttp tqdm
# Each day we're checking 1/28 of the domains we know.
# This ensures that all domains are tested every month.
# For simplicity, we just don't test on days that may not exist
......@@ -84,14 +80,14 @@ refresh:
- if [[ $(date +%d) -le 28 ]];
then
echo "Check some domains over HTTP...";
python3 scripts/http_checker.py --silent --partial $(date +%d)/28;
.venv/bin/python scripts/http_checker.py --silent --partial $(date +%d)/28;
echo "What changed:";
git diff --stat urls.txt;
CHANGED="$(git diff urls.txt | grep ^-http | cut -d/ -f3)";
if [[ $CHANGED ]]; then
echo "Re-check domains that we just considered down";
echo "just in case they're up now...";
python3 scripts/http_checker.py
.venv/bin/python scripts/http_checker.py
--silent
--slow
--partial $(date +%d)/28
......
......@@ -99,6 +99,9 @@ L’association [ADULLACT](https://adullact.org/) souhaite établir des statisti
DashLord est né à la [Fabrique des ministères sociaux](https://fabrique.social.gouv.fr/) pour répondre aux besoins d’évaluation et de mise en œuvre des bonnes pratiques de développement web.
### [Établi](https://etabli.incubateur.net/) : un annuaire des initiatives publiques numériques
Service qui référence les initiatives publiques numériques françaises, ce afin d'augmenter leur découvrabilité et leur (ré)utilisation. Il a été réalisé au sein de l'équipe [beta.gouv.fr](https://beta.gouv.fr/).
# Licence
......
This diff is collapsed.
......@@ -24,15 +24,19 @@ def err(*args, **kwargs):
print(*args, **kwargs)
def check_is_sorted(file, lines):
    """Report an error when the domains built from *lines* are not in sorted order.

    Each entry of *lines* is turned into a ``Domain`` through
    ``Domain.from_file_line(file, entry)``; the resulting list is then
    compared with its sorted counterpart.
    """
    parsed = []
    for entry in lines:
        parsed.append(Domain.from_file_line(file, entry))
    expected_order = sorted(parsed)
    if parsed == expected_order:
        return
    err(f"{file}: Is not sorted, run `python scripts/sort.py domains.csv`")
def warn(*args, **kwargs):
    """Print a warning to standard error.

    Accepts the same arguments as ``print``; any ``file`` keyword given
    by the caller is overridden so the output always goes to
    ``sys.stderr``.
    """
    options = dict(kwargs)
    options["file"] = sys.stderr
    print(*args, **options)
def check_is_valid_domain(file, lineno, line):
if not validators.domain(line):
if not validators.domain(line, rfc_2782=True):
err(f"{file}:{lineno}: {line!r} does not looks like a domain name.")
if not validators.domain(line):
warn(
f"{file}:{lineno}: {line!r} cannot be used in an URL,",
"it's either an DNS SRV record or a typo.",
)
def check_lowercased(file, lineno, line):
......@@ -45,6 +49,21 @@ def check_is_public_domain(file, lineno, line):
err(f"{file}:{lineno}: {line!r} is not a public domain.")
class SortedChecker:
    """Callable checker that reports the first out-of-order domain row.

    Each call compares the given row with the row received on the
    previous call. Only the first ordering error is reported; later
    calls return immediately.
    """

    def __init__(self):
        # Domain built from the row seen on the previous call, if any.
        self.previous = None
        # Becomes True once an ordering error has been reported.
        self.has_errored = False

    def __call__(self, file, lineno, domain):
        """Report through ``err`` when *domain* sorts before the previous row.

        *file* is used both for parsing and in the error message.
        *lineno* is accepted but not used by this checker.
        """
        if self.has_errored:
            return  # Don't flood
        # Parse the row once and reuse it for the comparison and as the
        # remembered value for the next call.
        current = Domain.from_file_line(file, domain)
        if self.previous is not None and self.previous > current:
            err(f"{file}: Is not sorted, run `python scripts/sort.py domains.csv`")
            self.has_errored = True
        self.previous = current
class DuplicateChecker:
def __init__(self):
self.seen = {}
......@@ -88,16 +107,21 @@ class DuplicateChecker:
def main():
check_duplicate_line = DuplicateChecker()
checkers = [
check_duplicate_line,
check_is_valid_domain,
check_is_public_domain,
check_lowercased,
SortedChecker(),
]
with open("domains.csv", encoding="UTF-8") as domainsfile:
domainsreader = csv.reader(domainsfile)
next(domainsreader) # Skip header
lines = [row[0] for row in domainsreader]
check_is_sorted("domains.csv", lines)
for lineno, line in enumerate(lines, start=2):
check_is_valid_domain("domains.csv", lineno, line)
check_is_public_domain("domains.csv", lineno, line)
check_duplicate_line("domains.csv", lineno, line)
check_lowercased("domains.csv", lineno, line)
for line in domainsreader:
for checker in checkers:
checker("domains.csv", domainsreader.line_num, line[0])
for domain in parse_files(Path("urls.txt")) - check_duplicate_line.all_domains:
err(f"urls.txt: {domain} not found in domains.csv.")
......
......@@ -79,6 +79,7 @@ NON_PUBLIC_DOMAINS = {
"sioracderiberac.com",
"varchetta.fr", # squatte www.commune-la-chapelle-de-brain.fr
"viteundevis.com", # squatte mairiemarignaclasclares.fr
"vitry-sur-orne.com", # domaine squatté
"voxaly.com",
"wewmanager.com",
}
......@@ -17,9 +17,9 @@ So the next maintainer can import using:
import argparse
from pathlib import Path
import psycopg2
import psycopg
from public_domain import Domain, NON_PUBLIC_DOMAINS, parse_csv_file, write_csv_file
from public_domain import NON_PUBLIC_DOMAINS, Domain, parse_csv_file, write_csv_file
ROOT = Path(__file__).resolve().parent.parent
FILE = ROOT / "domains.csv"
......@@ -27,19 +27,22 @@ FILE = ROOT / "domains.csv"
def query_ct_logs(last_id):
"""Query crt.sh using their postgres public API."""
conn = psycopg2.connect(dbname="certwatch", user="guest", host="crt.sh")
conn.set_session(readonly=True, autocommit=True)
cur = conn.cursor()
cur.execute(
"""SELECT id, altnames.*, x509_subjectname(certificate) subject
FROM certificate, LATERAL (SELECT * FROM x509_altnames(certificate)) altnames
WHERE plainto_tsquery('gouv.fr') @@ identities(certificate) AND id > %s""",
(last_id,),
)
with psycopg.connect(dbname="certwatch", user="guest", host="crt.sh", autocommit=True) as conn:
conn.read_only = True
with conn.cursor() as cur:
cur.execute(
"""SELECT id, altnames.*, x509_subjectname(certificate) subject
FROM certificate, LATERAL (
SELECT * FROM x509_altnames(certificate)
) altnames
WHERE plainto_tsquery('gouv.fr') @@ identities(certificate)
AND id > %s""",
(last_id,),
)
results = cur.fetchall()
domains = parse_csv_file(FILE)
primary_key = None
results = cur.fetchall()
for primary_key, domain, subject in results:
domain = Domain(
domain.lower(),
......
tqdm
aiohttp
validators
psycopg2
validators>=0.24.0 # where they introduced rfc_2782.
psycopg[binary]
requests
This diff is collapsed.