<?php
/**
 * Proof of concept (v0.2): print Wikidata facts about a French region, the
 * departments it contains and the communes of each department.
 *
 * Entity documents are downloaded from Wikidata's Special:EntityData endpoint
 * and cached as JSON files under ./cache so later runs work from disk.
 *
 * Provenance: file wikidata_200.php, introduced by commit
 * ed0af3b70784cc05c21d80716ed120628f341f9c ("poc v0.2"),
 * Fabrice Gangler <fabrice.gangler@adullact.org>, Fri, 3 Jul 2020 11:12:01 +0200.
 */

/**
 * Turns a UTF-8 string into a lowercase ASCII slug.
 *
 * Characters are transliterated to ASCII, everything except letters, digits
 * and the separator characters (/ _ | + space -) is dropped, and every run of
 * separator characters becomes a single $delimiter. The result never starts or
 * ends with the delimiter.
 *
 * @param string $string    UTF-8 text to convert
 * @param string $delimiter text inserted between words (used literally)
 *
 * @return string the slug; empty when $string holds no usable character
 */
function slugify($string, $delimiter = '-')
{
    $string = (string) $string;

    // iconv's //TRANSLIT output depends on LC_CTYPE, so switch to a UTF-8
    // locale for the conversion only and restore the caller's locale at once.
    $previousLocale = setlocale(LC_ALL, '0');
    setlocale(LC_ALL, 'en_US.UTF-8');
    $ascii = iconv('UTF-8', 'ASCII//TRANSLIT', $string);
    if ($previousLocale !== false) {
        setlocale(LC_ALL, $previousLocale);
    }

    // iconv() returns false on invalid input or an unsupported conversion;
    // fall back to the raw string and let the filter below drop non-ASCII bytes.
    if ($ascii === false) {
        $ascii = $string;
    }

    $filtered = strtolower((string) preg_replace('/[^a-zA-Z0-9\/_|+ -]/', '', $ascii));

    // Split on separator runs and join the words: no leading/trailing
    // delimiter can appear, and the delimiter is never parsed as a regex
    // replacement or as a trim() character list.
    $words = preg_split('/[\/_|+ -]+/', $filtered, -1, PREG_SPLIT_NO_EMPTY);

    return implode($delimiter, $words === false ? [] : $words);
}

/**
 * Formats the banner line that introduces a region, department or commune.
 *
 * @param string $title text shown in the banner
 *
 * @return string
 */
function wdSectionHeading($title)
{
    return "\n..... $title ........................................................................\n\n";
}

/**
 * Writes $contents to $path, creating the parent directory when needed.
 *
 * @param string $path
 * @param string $contents
 *
 * @return void
 *
 * @throws RuntimeException when the directory or the file cannot be written
 */
function wdStoreFile($path, $contents)
{
    $directory = dirname($path);
    // The second is_dir() covers another process creating the directory
    // between the first check and mkdir().
    if (!is_dir($directory) && !mkdir($directory, 0777, true) && !is_dir($directory)) {
        throw new RuntimeException("Unable to create directory $directory");
    }
    if (file_put_contents($path, $contents) === false) {
        throw new RuntimeException("Unable to write $path");
    }
}

/**
 * Downloads one URL and returns its body.
 *
 * @param string $url
 *
 * @return string non-empty response body
 *
 * @throws RuntimeException when the request fails or returns nothing
 */
function wdDownload($url)
{
    // Wikimedia asks API clients to identify themselves with a User-Agent.
    // TODO: add contact details to this string before running at scale.
    $context = stream_context_create([
        'http' => [
            'user_agent' => 'wikidata_200.php proof of concept (PHP ' . PHP_VERSION . ')',
            'timeout' => 30,
        ],
    ]);

    $body = file_get_contents($url, false, $context);
    if ($body === false || $body === '') {
        throw new RuntimeException("Unable to download $url");
    }

    return $body;
}

/**
 * Returns the decoded Wikidata entity $entityId, using $cacheFile when it
 * holds a valid document and downloading (then caching) it otherwise.
 *
 * @param string      $entityId   Wikidata item ID such as "Q12545"
 * @param string      $cacheFile  path of the cached JSON document
 * @param string|null $mirrorFile optional second path receiving a copy of
 *                                every freshly downloaded document
 *
 * @return object the entity object found under "entities" in the document
 *
 * @throws RuntimeException when the ID is malformed, the download fails, the
 *                          document is not valid JSON or holds no such entity
 */
function wdLoadEntity($entityId, $cacheFile, $mirrorFile = null)
{
    // IDs may come from remote JSON and end up in file paths and URLs.
    if (!is_string($entityId) || preg_match('/^Q[0-9]+$/', $entityId) !== 1) {
        throw new RuntimeException('Invalid Wikidata item ID: ' . var_export($entityId, true));
    }

    $document = null;
    if (is_file($cacheFile)) {
        $cached = file_get_contents($cacheFile);
        if ($cached !== false) {
            $document = json_decode($cached);
        }
    }

    // Missing, truncated or corrupt cache: fetch again, and only store the
    // response once it is known to be a usable document.
    if (!isset($document->entities)) {
        $url = "https://www.wikidata.org/wiki/Special:EntityData/$entityId.json";
        $body = wdDownload($url);
        $document = json_decode($body);
        if (!isset($document->entities)) {
            throw new RuntimeException("Unexpected response from $url");
        }
        wdStoreFile($cacheFile, $body);
        if ($mirrorFile !== null) {
            wdStoreFile($mirrorFile, $body);
        }
    }

    if (isset($document->entities->{$entityId})) {
        return $document->entities->{$entityId};
    }

    // NOTE(review): a redirected item is presumably returned under its target
    // ID; accept the document when it holds exactly one entity - confirm.
    $entities = get_object_vars($document->entities);
    if (count($entities) === 1) {
        return reset($entities);
    }

    throw new RuntimeException("Entity $entityId not found in $cacheFile");
}

/**
 * Returns the "claims" object of an entity, or null when it has none.
 *
 * @param object $entity
 *
 * @return object|null
 */
function wdClaims($entity)
{
    return isset($entity->claims) ? $entity->claims : null;
}

/**
 * Returns the statements recorded for one property.
 *
 * @param object|null $claims   "claims" object of an entity
 * @param string      $property property ID such as "P150"
 *
 * @return array list of statement objects, empty when the property is absent
 */
function wdClaimList($claims, $property)
{
    if (isset($claims->{$property}) && is_array($claims->{$property})) {
        return $claims->{$property};
    }

    return [];
}

/**
 * Returns the value of the first statement of a property.
 *
 * @param object|null $claims   "claims" object of an entity
 * @param string      $property property ID such as "P402"
 * @param mixed       $default  returned when the property has no value
 *
 * @return mixed
 */
function wdClaimValue($claims, $property, $default = '')
{
    if (isset($claims->{$property}[0]->mainsnak->datavalue->value)) {
        return $claims->{$property}[0]->mainsnak->datavalue->value;
    }

    return $default;
}

/**
 * Returns the item ID a statement points to, or '' when it has none.
 *
 * @param object $statement
 *
 * @return string
 */
function wdStatementItemId($statement)
{
    if (isset($statement->mainsnak->datavalue->value->id)) {
        return $statement->mainsnak->datavalue->value->id;
    }

    return '';
}

/**
 * Returns a top-level scalar field of an entity ("id", "modified", ...).
 *
 * @param object $entity
 * @param string $field
 *
 * @return string '' when the field is absent
 */
function wdField($entity, $field)
{
    return isset($entity->{$field}) ? $entity->{$field} : '';
}

/**
 * Returns a localized text of an entity.
 *
 * @param object $entity
 * @param string $section  "labels" or "descriptions"
 * @param string $language language code such as "fr"
 *
 * @return string '' when no text exists in that language
 */
function wdText($entity, $section, $language)
{
    if (isset($entity->{$section}->{$language}->value)) {
        return $entity->{$section}->{$language}->value;
    }

    return '';
}

/**
 * Returns the URL of one sitelink of an entity.
 *
 * @param object $entity
 * @param string $site site key such as "frwiki" or "commonswiki"
 *
 * @return string '' when the entity has no such sitelink
 */
function wdSiteLink($entity, $site)
{
    return isset($entity->sitelinks->{$site}->url) ? $entity->sitelinks->{$site}->url : '';
}

/**
 * Formats every population (P1082) statement as one output line.
 *
 * Statements without an amount are skipped.
 *
 * @param object|null $claims "claims" object of an entity
 *
 * @return string zero or more newline-terminated lines
 */
function wdFormatPopulation($claims)
{
    $lines = '';
    foreach (wdClaimList($claims, 'P1082') as $index => $statement) {
        if (!isset($statement->mainsnak->datavalue->value->amount)) {
            continue;
        }
        // Amounts are signed strings such as "+1132481".
        $amount = (float) str_replace('+', '', $statement->mainsnak->datavalue->value->amount);
        $formatted = number_format($amount, 0, ',', ' ');
        $lines .= "$index - $formatted personnes ----> @@@TODO extract date\n";
    }

    return $lines;
}

/**
 * Tells whether an entity should be listed as a commune.
 *
 * Only the FIRST P31 statement is inspected: its value must be Q484170 and it
 * must carry no P582 qualifier.
 * NOTE(review): an item whose commune statement is not listed first is
 * skipped - confirm this is intended.
 *
 * @param object $entity
 *
 * @return bool
 */
function wdIsListedCommune($entity)
{
    if (!isset($entity->claims->P31[0])) {
        return false;
    }
    $firstType = $entity->claims->P31[0];

    return wdStatementItemId($firstType) === 'Q484170'
        && !isset($firstType->qualifiers->P582);
}

/**
 * Builds the text block describing one commune.
 *
 * @param string $communeId ID the commune was requested under
 * @param object $commune   decoded entity
 * @param string $cacheFile path of the cached document (shown in the output)
 *
 * @return string
 */
function wdRenderCommune($communeId, $commune, $cacheFile)
{
    $claims = wdClaims($commune);
    $id = wdField($commune, 'id');
    $name = wdText($commune, 'labels', 'fr');
    $modified = wdField($commune, 'modified');
    $descriptionFr = wdText($commune, 'descriptions', 'fr');
    $descriptionEn = wdText($commune, 'descriptions', 'en');

    $linkCommons = wdSiteLink($commune, 'commonswiki');
    $linkWikipediaFr = wdSiteLink($commune, 'frwiki');
    $linkWikipediaEn = wdSiteLink($commune, 'enwiki');
    $linkWikidata = "https://www.wikidata.org/wiki/$id";
    $linkWikidataJson = "https://www.wikidata.org/wiki/Special:EntityData/$id.json";

    // The INSEE, OpenStreetMap and directory URLs are only built when the
    // matching identifier exists, so a missing ID prints an empty URL.
    $inseeId = wdClaimValue($claims, 'P374');
    $inseeUrlStat1 = '';
    $inseeUrlStat2 = '';
    $inseeUrlGeo = '';
    if ($inseeId !== '') {
        $inseeUrlStat1 = "https://www.insee.fr/fr/statistiques/2011101?geo=COM-$inseeId";
        $inseeUrlStat2 = "https://www.insee.fr/fr/statistiques?geo=COM-$inseeId";
        $inseeUrlGeo = "https://www.insee.fr/fr/metadonnees/cog/communes/COM$inseeId-" . slugify($name);
    }

    $osmId = wdClaimValue($claims, 'P402');
    $osmUrl = $osmId !== '' ? "https://www.openstreetmap.org/relation/$osmId" : '';

    $postalCode = wdClaimValue($claims, 'P281');
    $website = wdClaimValue($claims, 'P856');

    $directoryId = wdClaimValue($claims, 'P6671');
    $directoryUrl = $directoryId !== '' ? "https://lannuaire.service-public.fr/$directoryId" : '';

    return wdSectionHeading("$inseeId - $name")
        . "Cache: $cacheFile \n"
        . "Wikidata update: $modified \n"
        . "Wikidata ID: $communeId / $id \n"
        . "Wikidata Name: $name \n"
        . "INSEE Code: $inseeId \n"
        . "Postal Code: $postalCode \n"
        . "Wikidata Type FR : $descriptionFr \n"
        . "Wikidata Type EN : $descriptionEn \n"
        . "URL website : $website \n"
        . "URL wikipedia FR : {$linkWikipediaFr} \n"
        . "URL wikipedia EN : {$linkWikipediaEn} \n"
        . "URL wikimedia commons : {$linkCommons} \n"
        . "URL WikiData : {$linkWikidata} \n"
        . "URL WikiData Json : $linkWikidataJson \n"
        . "URL INSEE : {$inseeUrlGeo} \n"
        . "URL INSEE stat. 1 : {$inseeUrlStat1} \n"
        . "URL INSEE stat. 2 : {$inseeUrlStat2} \n"
        . "URL Public directory : $directoryUrl \n"
        . "URL Open Street Map : {$osmUrl} \n"
        . "Open Street Map ID: $osmId \n"
        . "\n"
        . "--- Population ----------------------- \n"
        . wdFormatPopulation($claims)
        . "\n";
}

/**
 * Prints one department followed by all of its listed communes.
 *
 * A commune that cannot be loaded is reported with error_log() and skipped.
 *
 * @param string $dptId          Wikidata ID of the department
 * @param string $cacheDirectory root of the cache tree
 *
 * @return void
 *
 * @throws RuntimeException when the department itself cannot be loaded
 */
function wdPrintDepartment($dptId, $cacheDirectory)
{
    $dptCacheDirectory = "$cacheDirectory/departements";
    $dptFilePath = "$dptCacheDirectory/$dptId.json";
    $dpt = wdLoadEntity($dptId, $dptFilePath);
    $dptClaims = wdClaims($dpt);

    $dptIdProperty = wdField($dpt, 'id');
    $dptModified = wdField($dpt, 'modified');
    $dptName = wdText($dpt, 'labels', 'fr');
    $dptDescriptionFr = wdText($dpt, 'descriptions', 'fr');
    $dptDescriptionEn = wdText($dpt, 'descriptions', 'en');

    // First French alias is shown as the department code, the second one
    // (when present) as its full name.
    $dptCode = isset($dpt->aliases->fr[0]->value) ? $dpt->aliases->fr[0]->value : '';
    $dptFullName = isset($dpt->aliases->fr[1]->value) ? $dpt->aliases->fr[1]->value : $dptName;

    $dptLinkCommons = wdSiteLink($dpt, 'commonswiki');
    $dptLinkWikipediaFr = wdSiteLink($dpt, 'frwiki');
    $dptLinkWikipediaEn = wdSiteLink($dpt, 'enwiki');
    $dptLinkWikidata = "https://www.wikidata.org/wiki/$dptIdProperty";
    $dptLinkWikidataJson = "https://www.wikidata.org/wiki/Special:EntityData/$dptIdProperty.json";

    $dptInseeId = wdClaimValue($dptClaims, 'P2586');
    $dptInseeUrlStat = "https://www.insee.fr/fr/statistiques?geo=DEP-$dptInseeId";
    $dptInseeUrlGeo = "https://www.insee.fr/fr/metadonnees/cog/departement/DEP$dptInseeId-" . slugify($dptName);
    $dptOsmId = wdClaimValue($dptClaims, 'P402');
    $dptOsmUrl = "https://www.openstreetmap.org/relation/$dptOsmId";

    // Communes are rendered into a buffer first because the department
    // summary, printed before them, needs the final counters.
    $subdivisions = wdClaimList($dptClaims, 'P150');
    $subdivisionCount = count($subdivisions);
    $communeCount = 0;
    $communeWithWebsiteCount = 0;
    $communesOutput = '';

    foreach ($subdivisions as $statement) {
        $communeId = wdStatementItemId($statement);
        if ($communeId === '') {
            continue;
        }
        $filePath = "$dptCacheDirectory/$dptId/$communeId.json";
        $filePathBackup = "$cacheDirectory/communes/$communeId.json";

        try {
            $commune = wdLoadEntity($communeId, $filePath, $filePathBackup);
        } catch (RuntimeException $e) {
            error_log("Subdivision $communeId of $dptId skipped: " . $e->getMessage());
            continue;
        }

        if (!wdIsListedCommune($commune)) {
            continue;
        }
        if (wdClaimValue(wdClaims($commune), 'P856') !== '') {
            $communeWithWebsiteCount++;
        }
        $communesOutput .= wdRenderCommune($communeId, $commune, $filePath);
        $communeCount++;
    }

    echo wdSectionHeading($dptFullName);
    echo "Cache: $dptFilePath \n";
    echo "Wikidata update: $dptModified \n";
    echo "Wikidata ID: $dptIdProperty \n";
    echo "Wikidata Name: $dptName \n";
    echo "Wikidata Full Name: $dptFullName \n";
    echo "Wikidata Dpt Code: $dptCode \n";
    echo "INSEE Dpt Code: $dptInseeId \n";
    echo "Wikidata Type FR : $dptDescriptionFr \n";
    echo "Wikidata Type EN : $dptDescriptionEn \n";
    echo "URL wikipedia FR : {$dptLinkWikipediaFr} \n";
    echo "URL wikipedia EN : {$dptLinkWikipediaEn} \n";
    echo "URL wikimedia commons : {$dptLinkCommons} \n";
    echo "URL WikiData : {$dptLinkWikidata} \n";
    echo "URL WikiData Json : $dptLinkWikidataJson \n";
    echo "URL INSEE : $dptInseeUrlGeo <--- @@@TODO slug \n";
    echo "URL INSEE statistiques : $dptInseeUrlStat \n";
    echo "URL Open Street Map : {$dptOsmUrl} \n";
    echo "Open Street Map ID: $dptOsmId \n";
    echo "\n";
    echo "--- Population ----------------------- \n";
    echo wdFormatPopulation($dptClaims);
    echo "\n";
    echo "----- $communeCount communes (vs $subdivisionCount subdivisions administratives) ------------------------ \n";
    echo "----- dont $communeWithWebsiteCount communes avec un site web ------------------------ \n";
    echo "\n";
    echo $communesOutput;
}

/**
 * Prints a region, then every department it contains (P150) with its communes.
 *
 * Entities that cannot be loaded are reported with error_log() and skipped;
 * this function does not throw for download or cache failures.
 *
 * @param string $rgId wikidata region ID
 *
 * @return void
 */
function run($rgId)
{
    $cacheDirectory = "./cache";
    $rgFilePath = "$cacheDirectory/regions/$rgId.json";

    try {
        $rg = wdLoadEntity($rgId, $rgFilePath);
    } catch (RuntimeException $e) {
        error_log("Region $rgId skipped: " . $e->getMessage());
        return;
    }
    $rgClaims = wdClaims($rg);

    $rgIdProperty = wdField($rg, 'id');
    $rgModified = wdField($rg, 'modified');
    $rgName = wdText($rg, 'labels', 'fr');
    $rgDescriptionFr = wdText($rg, 'descriptions', 'fr');
    $rgDescriptionEn = wdText($rg, 'descriptions', 'en');

    $rgLinkCommons = wdSiteLink($rg, 'commonswiki');
    $rgLinkWikipediaFr = wdSiteLink($rg, 'frwiki');
    $rgLinkWikipediaEn = wdSiteLink($rg, 'enwiki');
    $rgLinkWikidata = "https://www.wikidata.org/wiki/$rgIdProperty";
    $rgLinkWikidataJson = "https://www.wikidata.org/wiki/Special:EntityData/$rgIdProperty.json";

    $rgInseeId = wdClaimValue($rgClaims, 'P2585');
    $rgInseeUrlStat = "https://www.insee.fr/fr/statistiques?geo=REG-$rgInseeId";
    $rgInseeUrlGeo = "https://www.insee.fr/fr/metadonnees/cog/region/REG$rgInseeId-" . slugify($rgName);
    $rgOsmId = wdClaimValue($rgClaims, 'P402');
    $rgOsmUrl = "https://www.openstreetmap.org/relation/$rgOsmId";

    $rgTwitter = wdClaimValue($rgClaims, 'P2002');
    $rgDataGouvId = wdClaimValue($rgClaims, 'P3206');
    $rgDataGouvUrl = $rgDataGouvId !== '' ? "https://www.data.gouv.fr/organizations/$rgDataGouvId/" : '';

    echo wdSectionHeading("Région $rgName");
    echo "Cache: $rgFilePath \n";
    echo "Wikidata update: $rgModified \n";
    echo "Wikidata ID: $rgIdProperty \n";
    echo "Wikidata Name: $rgName \n";
    echo "INSEE Code: $rgInseeId \n";
    echo "DataGouv ID: $rgDataGouvId \n";
    echo "Wikidata Type FR : $rgDescriptionFr \n";
    echo "Wikidata Type EN : $rgDescriptionEn \n";
    echo "URL Twitter : https://twitter.com/$rgTwitter \n";
    echo "URL wikipedia FR : {$rgLinkWikipediaFr} \n";
    echo "URL wikipedia EN : {$rgLinkWikipediaEn} \n";
    echo "URL wikimedia commons : {$rgLinkCommons} \n";
    echo "URL WikiData : {$rgLinkWikidata} \n";
    echo "URL WikiData Json : $rgLinkWikidataJson \n";
    echo "URL Data Gouv : $rgDataGouvUrl \n";
    echo "URL INSEE : $rgInseeUrlGeo <--- @@@TODO slug \n";
    echo "URL INSEE statistiques : $rgInseeUrlStat \n";
    echo "URL Open Street Map : {$rgOsmUrl} \n";
    echo "Open Street Map ID: $rgOsmId \n";
    echo "\n";
    echo "--- Population ----------------------- \n";
    echo wdFormatPopulation($rgClaims);

    foreach (wdClaimList($rgClaims, 'P150') as $statement) {
        $dptId = wdStatementItemId($statement);
        if ($dptId === '') {
            continue;
        }
        try {
            wdPrintDepartment($dptId, $cacheDirectory);
        } catch (RuntimeException $e) {
            error_log("Department $dptId skipped: " . $e->getMessage());
        }
    }
}

// NOTE(review): the label below comes from the original PoC, yet run() reads
// this ID as a region (INSEE region code, list of departments) - confirm.
$rgIds = [
    'Q18678265', // 34 - Hérault
];
foreach ($rgIds as $rgId) {
    run($rgId);
}