Обучаю Битриксу программистов и интеграторов. Подробнее ⇒

Парсеры



Давно писала парсеры супермаркетов; наверняка они уже не работают, но пусть будут тут для будущих проектов.
<?
// Bootstrap of the AJAX handler: load the Bitrix prolog, then refuse to go any
// further unless the request passes every check below.
define("STOP_STATISTICS", true);
require_once($_SERVER["DOCUMENT_ROOT"]."/bitrix/modules/main/include/prolog_before.php");

global $USER, $APPLICATION, $CACHE_MANAGER;

// The checks are evaluated left to right and stop at the first failure, so the
// iblock module is only loaded for an administrator.
$bRequestAllowed =
   $USER->IsAdmin() &&
   CModule::IncludeModule("iblock") &&
   check_bitrix_sessid() &&
   $_SERVER["REQUEST_METHOD"] == "POST" &&
   isset($_REQUEST["action"]);

if (!$bRequestAllowed) {
    // Silently stop: no output is produced for a rejected request.
    return;
}

/**
 * Downloads a catalogue page from av.ru and extracts product cards and,
 * optionally, pager links from it.
 *
 * A product is taken from every <div> whose class attribute is exactly
 * "b-product js-product " (with the trailing space) and whose
 * data-unit-price attribute is not empty.
 *
 * @param string $url        Site-relative path; it is appended to "https://av.ru".
 * @param bool   $bPagerView When truthy, hrefs of all links inside the
 *                           <div class="b-pager__list"> block are collected.
 * @return array array("PRODUCTS" => array of product arrays (CODE, WEIGHTED,
 *               MEASURE, PRICE and, when found, NAME, LINK, IMAGE),
 *               "LINKS" => array of pager hrefs). Both lists are empty when
 *               the page could not be downloaded.
 */
function ParseAv($url, $bPagerView = false){
   $arProducts = array();
   $arLinks = array();

   // Request throttling (sleep(rand(2,6))) used to live here and is disabled.
   $html = file_get_contents('https://av.ru'.$url);
   if ($html === false || trim($html) === '') {
      // Nothing to parse: loadHTML() rejects an empty document, so bail out early.
      return array("PRODUCTS" => $arProducts, "LINKS" => $arLinks);
   }

   $dom = new DOMDocument;
   // Real-world markup is rarely valid; keep libxml's complaints out of the
   // response and restore the previous error mode afterwards.
   $bPrevErrors = libxml_use_internal_errors(true);
   $dom->loadHTML($html);
   libxml_clear_errors();
   libxml_use_internal_errors($bPrevErrors);

   foreach ($dom->getElementsByTagName('div') as $item) {
      $class = $item->getAttribute('class');

      if ($class == "b-product js-product ") {
         $arProduct = array(
            "CODE"     => $item->getAttribute('data-code'),
            "WEIGHTED" => $item->getAttribute('data-weighted'),
            "MEASURE"  => $item->getAttribute('data-measure'),
            "PRICE"    => $item->getAttribute('data-unit-price'),
         );

         // Title link carries both the product name and its URL.
         foreach ($item->getElementsByTagName('a') as $itemLink) {
            if ($itemLink->getAttribute('class') == "b-product__title") {
               $arProduct["NAME"] = $itemLink->nodeValue;
               $arProduct["LINK"] = $itemLink->getAttribute('href');
            }
         }

         foreach ($item->getElementsByTagName('img') as $itemImage) {
            if ($itemImage->getAttribute('class') == "b-product__photo") {
               $arProduct["IMAGE"] = $itemImage->getAttribute('src');
            }
         }

         // Cards without a price are skipped.
         if ($arProduct["PRICE"] != "") {
            $arProducts[] = $arProduct;
         }
      }
      elseif ($bPagerView && $class == "b-pager__list") {
         foreach ($item->getElementsByTagName('a') as $itemLink) {
            $arLinks[] = $itemLink->getAttribute('href');
         }
      }
   }

   return array("PRODUCTS" => $arProducts, "LINKS" => $arLinks);
}

/**
 * Downloads a catalogue page from utkonos.ru and extracts product cards and,
 * optionally, links to the remaining pages.
 *
 * Product data is read from the hidden <input name="log_json"> of every
 * <div class="goods_view_box-view goods_view goods_view-item">; its value is
 * decoded as JSON and the keys price, name, count and category_id are used.
 *
 * @param string $url        Site-relative path; it is appended to
 *                           "https://www.utkonos.ru".
 * @param bool   $bPagerView Pager links are built only when this is exactly true.
 * @return array array("PRODUCTS" => array of product arrays, "LINKS" => array
 *               of "<url>/page/<n>" paths for n = 2..last page). Both lists
 *               are empty when the page could not be downloaded.
 */
function ParseUtka($url, $bPagerView = false){
   $arProducts = array();
   $arLinks = array();

   // Request throttling (sleep(rand(2,6))) used to live here and is disabled.
   $html = file_get_contents('https://www.utkonos.ru'.$url);
   if ($html === false || trim($html) === '') {
      // Nothing to parse: loadHTML() rejects an empty document, so bail out early.
      return array("PRODUCTS" => $arProducts, "LINKS" => $arLinks);
   }

   $dom = new DOMDocument;
   // Keep libxml's markup warnings out of the response; restore the mode afterwards.
   $bPrevErrors = libxml_use_internal_errors(true);
   $dom->loadHTML($html);
   libxml_clear_errors();
   libxml_use_internal_errors($bPrevErrors);

   foreach ($dom->getElementsByTagName('div') as $item) {
      $class = $item->getAttribute('class');

      if ($class == "goods_view_box-view goods_view goods_view-item") {
         $arProduct = array("CODE" => $item->getAttribute('data-item_id'));

         // The picture sits inside the <a class="pic_pic"> link.
         foreach ($item->getElementsByTagName('a') as $itemLink) {
            if ($itemLink->getAttribute('class') == "pic_pic") {
               foreach ($itemLink->getElementsByTagName('img') as $itemImage) {
                  $arProduct["IMAGE"] = $itemImage->getAttribute('src');
               }
            }
         }

         foreach ($item->getElementsByTagName('input') as $itemInput) {
            if ($itemInput->getAttribute('name') == "log_json") {
               $rawJson = $itemInput->getAttribute('value');
               // A broken payload decodes to null, which the cast turns into
               // an empty array; every key is therefore checked before use.
               $arData = (array) json_decode($rawJson);
               $categoryId = isset($arData["category_id"]) ? $arData["category_id"] : null;

               $arProduct["PRICE"] = isset($arData["price"]) ? $arData["price"] : null;
               $arProduct["NAME"] = isset($arData["name"]) ? $arData["name"] : null;
               $arProduct["AMOUNT"] = isset($arData["count"]) ? $arData["count"] : null;
               $arProduct["LINK"] = '/item/'.$categoryId.'/'.$arProduct["CODE"];
               $arProduct["DATA"] = $rawJson;
            }
         }

         // Cards without a name (no usable log_json input) are skipped.
         if (isset($arProduct["NAME"]) && $arProduct["NAME"] != "") {
            $arProducts[] = $arProduct;
         }
      }
      elseif (($bPagerView === true) && ($class == "signature")) {
         $text = $item->nodeValue;
         $withoutLabel = str_replace('Страница: ', '', $text);
         // Only the block that actually contained the "Страница: " label is a pager.
         if ($withoutLabel !== $text) {
            $arTokens = explode(" ", $withoutLabel);
            // Cast explicitly: on PHP 8 comparing an int with a non-numeric
            // string is a string comparison, which would never end the loop.
            $lastPage = (int) $arTokens[count($arTokens) - 1];
            for ($page = 2; $page <= $lastPage; $page++) {
               $arLinks[] = $url.'/page/'.$page;
            }
         }
      }
   }

   return array("PRODUCTS" => $arProducts, "LINKS" => $arLinks);
}

/**
 * Downloads a catalogue page from okeydostavka.ru via cURL and extracts
 * product cards from every <div class="product ok-theme">.
 *
 * The downloaded HTML is always written to the Bitrix log with AddMessage2Log().
 *
 * @param string $url        Site-relative path; "&resultsPerPage=72" is
 *                           appended to it, so it presumably already carries a
 *                           query string - TODO confirm against callers.
 * @param bool   $bPagerView When exactly true and a
 *                           <div class="pages pageControlMenu"> is present,
 *                           LINKS is set to the single entry $url."&orderBy=4".
 * @return array array("PRODUCTS" => array of product arrays, "LINKS" => array).
 *               Both lists are empty when the download failed.
 */
function ParseOkeyMsk($url, $bPagerView = false){
   $arProducts = array();
   $arLinks = array();

   // Request throttling (sleep(rand(2,6))) used to live here and is disabled.
   $userAgent = 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36';
   $userCookieFile = $_SERVER["DOCUMENT_ROOT"]."/cookies.txt";

   $ch = curl_init();
   curl_setopt($ch, CURLOPT_URL, "https://www.okeydostavka.ru".$url."&resultsPerPage=72");
   curl_setopt($ch, CURLOPT_RETURNTRANSFER,1);
   // NOTE(review): "Accept_Encoding" is not the standard header name
   // ("Accept-Encoding"). It is left untouched on purpose: advertising gzip
   // without CURLOPT_ENCODING would hand back a compressed body.
   curl_setopt($ch, CURLOPT_HTTPHEADER, array(
        "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept_Encoding: gzip, deflate, sdch, br",
        "Accept-Language: ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4",
        "Cache-Control: max-age=0",
        "Connection: keep-alive",
        "Host: www.okeydostavka.ru",
        "Upgrade-Insecure-Requests:1",
        "User-Agent: ".$userAgent
        )
    );
   curl_setopt($ch, CURLOPT_COOKIEFILE, $userCookieFile);
   $html = curl_exec($ch);

   if ($html === false || $html === '') {
      // Transfer failed or returned nothing: report why instead of parsing "false".
      AddMessage2Log('ParseOkeyMsk: empty response for '.$url.' '.curl_error($ch));
      return array("PRODUCTS" => $arProducts, "LINKS" => $arLinks);
   }

   $dom = new DOMDocument;
   // Keep libxml's markup warnings out of the response; restore the mode afterwards.
   $bPrevErrors = libxml_use_internal_errors(true);
   // The XML declaration prefix makes libxml treat the markup as UTF-8.
   $dom->loadHTML('<?xml encoding="UTF-8">' .$html);
   libxml_clear_errors();
   libxml_use_internal_errors($bPrevErrors);

   foreach ($dom->getElementsByTagName('div') as $item) {
      $class = $item->getAttribute('class');

      if ($class == "product ok-theme") {
         $arProduct = array();

         foreach ($item->getElementsByTagName('div') as $item2) {
            $innerClass = $item2->getAttribute('class');

            if ($innerClass == "image") {
               // Name and URL come from the link wrapping the picture.
               foreach ($item2->getElementsByTagName('a') as $itemLink) {
                  $arProduct["NAME"] = $itemLink->getAttribute('title');
                  $arProduct["LINK"] = $itemLink->getAttribute('href');
               }
               foreach ($item2->getElementsByTagName('img') as $itemImage) {
                  $arProduct["IMAGE"] = $itemImage->getAttribute('src');
               }
            }
            elseif ($innerClass == "product_weight") {
               // First space-separated token of the cleaned text, with a
               // decimal comma turned into a dot.
               $arWeight = explode(" ", ReplaceBadSymb($item2->nodeValue));
               $arProduct["WEIGHTED"] = str_replace(array(","," "), array(".",""), $arWeight[0]);
               foreach ($item2->getElementsByTagName('span') as $itemSpan) {
                  $arProduct["MEASURE"] = ReplaceBadSymb($itemSpan->nodeValue);
               }
            }
         }

         foreach ($item->getElementsByTagName('span') as $itemSpan) {
            if ($itemSpan->getAttribute('itemprop') == "price") {
               $arPrice = explode(" ", ReplaceBadSymb($itemSpan->nodeValue));
               $arProduct["PRICE"] = str_replace(array(","," "), array(".",""), $arPrice[0]);

               // The product code is the part of the span id after the first "_".
               $arIdParts = explode("_", $itemSpan->getAttribute('id'));
               $arProduct["CODE"] = "okeymsk_".(isset($arIdParts[1]) ? $arIdParts[1] : "");
            }
         }

         // Cards without a name are skipped.
         if (isset($arProduct["NAME"]) && $arProduct["NAME"] != "") {
            $arProducts[] = $arProduct;
         }
      }
      elseif (($bPagerView === true) && ($class == "pages pageControlMenu")) {
         $arLinks = array($url."&orderBy=4");
      }
   }

   // The "only when nothing was parsed" condition was switched off by the
   // author, so the page is logged on every call.
   AddMessage2Log($html);

   return array("PRODUCTS" => $arProducts, "LINKS" => $arLinks);
}

/**
 * Stores one price record in the a_prices table
 * (columns ELEMENT_ID, ELEMENT_CODE, DATE_PRICE, PRICE).
 *
 * Nothing is written when any of the four arguments is empty. The insert runs
 * inside a transaction that is committed on success and rolled back when the
 * insert fails.
 *
 * @param int|string $id        Info block element ID.
 * @param string     $code      Product code.
 * @param string     $datePrice Date of the price; callers in this file pass date("Y-m-d").
 * @param string     $price     Price value.
 * @return void
 */
function AddNewPrice($id, $code, $datePrice, $price){
   global $DB;

   if ($id == "" || $code == "" || $datePrice == "" || $price == "") {
      return;
   }

   // CDatabase::Insert() puts the values into the statement verbatim, so each
   // one is escaped here: code and price originate from scraped pages.
   $arFields = array(
      "ELEMENT_ID"   => "'".$DB->ForSql($id)."'",
      "ELEMENT_CODE" => "'".$DB->ForSql($code)."'",
      "DATE_PRICE"   => "'".$DB->ForSql($datePrice)."'",
      "PRICE"        => "'".$DB->ForSql($price)."'",
   );

   $DB->StartTransaction();
   // The last argument (ignore_errors) makes a failed query come back as
   // false, so that the rollback below is actually reachable.
   $insertResult = $DB->Insert("a_prices", $arFields, __FUNCTION__.": ".__LINE__, false, "", true);
   if ($insertResult !== false) {
      $DB->Commit();
   }
   else {
      $DB->Rollback();
   }
}

/**
 * Synchronises parsed products with info block 3.
 *
 * Products whose CODE matches the PROPERTY_CODE of an existing element are
 * updated: a changed price is written to the PRICE property (the previous one
 * goes to OLD_PRICE) and recorded through AddNewPrice(), the element is
 * attached to $sectionId when it is not there yet, and MODIFIED_BY is set to
 * the current user. Products that matched nothing are added as new elements.
 *
 * @param array      $arProducts Product arrays as returned by the Parse*()
 *                               functions; entries without CODE are ignored.
 * @param int|string $sectionId  Info block section the products belong to.
 * @return array Report with the optional keys "UPD" (every matched element),
 *               "UPD1" (elements whose price changed), "NEW_GOOD_ADD" (IDs of
 *               created elements), "BAD_ADD_ERR" and "NEW_BAD_ADD" (error
 *               texts and products that could not be created).
 */
function AddProducts($arProducts,$sectionId){
   global $DB, $USER;
   $el = new CIBlockElement;
   // NOTE(review): the result of PrepareFields() is not used in this
   // function; kept in case something else relies on it - confirm.
   $DB->PrepareFields("a_prices");
   $today = date("Y-m-d");
   $arResult = array();
   $arCodes = array();
   $arProductsTemp = array();

   // Collect product codes and index the products by code.
   foreach ($arProducts as $arProduct){
      if(isset($arProduct["CODE"]) && $arProduct["CODE"] != ""){
         $arCodes[] = $arProduct["CODE"];
         $arProductsTemp[$arProduct["CODE"]] = $arProduct;
      }
   }

   // Fetch the elements that already carry one of the collected codes.
   if(count($arCodes)>0){
      $arSelect = Array("ID", "NAME", "IBLOCK_ID","IBLOCK_SECTION_ID","PROPERTY_PRICE","PROPERTY_CODE");
      $arFilter = Array(
         "IBLOCK_ID"=>"3",
         "PROPERTY_CODE" => $arCodes
      );
      $res = CIBlockElement::GetList(Array("sort"=>"desc"), $arFilter, false, Array("nPageSize"=>1999), $arSelect);
      while($ob = $res->GetNextElement()){
         $arFieldsTemp = $ob->GetFields();
         $productCode = $arFieldsTemp["PROPERTY_CODE_VALUE"];

         // A row whose code is not (or no longer) pending has no parsed data
         // to apply: the same code was already handled, or the stored code
         // differs from the parsed one. Touching it would overwrite the
         // stored price with an empty value, so it is left alone.
         if(!isset($arProductsTemp[$productCode])){
            continue;
         }

         $oldPrice = $arFieldsTemp["PROPERTY_PRICE_VALUE"];
         $newPrice = isset($arProductsTemp[$productCode]["PRICE"]) ? $arProductsTemp[$productCode]["PRICE"] : null;
         $arResult["UPD"][] = array("ID" =>$arFieldsTemp["ID"], "OLD"=>$oldPrice,"NEW"=>$newPrice, "CODE" => $productCode);

         if($oldPrice != $newPrice){
            // Write the new price to the element and keep the previous one.
            CIBlockElement::SetPropertyValuesEx($arFieldsTemp["ID"], false, array("PRICE" => $newPrice, "OLD_PRICE" => $oldPrice));
            // Record the new price in the price history table.
            AddNewPrice($arFieldsTemp["ID"],$productCode,$today,$newPrice);
            $arResult["UPD1"][] = array("ID" =>$arFieldsTemp["ID"], "OLD"=>$oldPrice,"NEW"=>$newPrice, "CODE" => $productCode);
         }

         // Attach the element to the requested section if it is not there yet.
         if($arFieldsTemp["IBLOCK_SECTION_ID"] != $sectionId){
            $db_old_groups = CIBlockElement::GetElementGroups($arFieldsTemp["ID"], true);
            $arOldGroups = array();
            while($ar_group = $db_old_groups->Fetch()){
               $arOldGroups[] = $ar_group["ID"];
            }
            if(!in_array($sectionId, $arOldGroups)){
               $arOldGroups[] = $sectionId;
               CIBlockElement::SetElementSection($arFieldsTemp["ID"], $arOldGroups);
            }
         }

         // Refresh the modification date and the modifying user.
         $el->Update($arFieldsTemp["ID"], array("MODIFIED_BY"  => $USER->GetID()));
         // Whatever stays in the map afterwards is treated as a new product.
         unset($arProductsTemp[$productCode]);
      }
   }

   // Add the products that matched no existing element.
   foreach ($arProductsTemp as $arProduct) {
      // Every parsed field except NAME is stored as an element property.
      $arProp = $arProduct;
      unset($arProp["NAME"]);
      $arLoadProductArray = Array(
        "IBLOCK_SECTION_ID" => $sectionId,
        "IBLOCK_ID"      => "3",
        "PROPERTY_VALUES"=> $arProp,
        "NAME"           => isset($arProduct["NAME"]) ? $arProduct["NAME"] : null,
        "ACTIVE"         => "Y"
      );
      if($PRODUCT_ID = $el->Add($arLoadProductArray)){
         $arResult["NEW_GOOD_ADD"][] = $PRODUCT_ID;
         AddNewPrice($PRODUCT_ID,$arProp["CODE"],$today,isset($arProp["PRICE"]) ? $arProp["PRICE"] : null);
      }
      else{
         $arResult["BAD_ADD_ERR"][] = $el->LAST_ERROR;
         $arResult["NEW_BAD_ADD"][] = $arProduct;
      }
   }
   return $arResult;
}

/**
 * Recursively creates info block sections (info block 3) from a nested array.
 *
 * @param array      $arSections List of nodes; each node is an array with
 *                               'name', optionally 'link' (stored in the
 *                               UF_LINK field) and optionally 'subsections'
 *                               (a list of nodes of the same shape).
 * @param int|string $parentId   ID of the parent section; when empty the
 *                               sections are created without a parent.
 * @return void
 */
function SectionAdd($arSections, $parentId = ''){
   $bs = new CIBlockSection;
   foreach ($arSections as $arSection) {
      // The tree comes from request data, so a node may be a plain string.
      if (!is_array($arSection)) {
         continue;
      }

      $arFields = Array(
        "ACTIVE" => "Y",
        "IBLOCK_ID" => "3",
        "NAME" => isset($arSection['name']) ? $arSection['name'] : null,
      );
      if($parentId != ""){
         $arFields["IBLOCK_SECTION_ID"] = $parentId;
      }
      if(isset($arSection['link']) && $arSection['link'] != ''){
         $arFields["UF_LINK"] = $arSection["link"];
      }

      $id = $bs->Add($arFields);

      // Leaf nodes carry no 'subsections' key; count() on a missing value is
      // a TypeError on PHP 8, hence the explicit checks.
      $bHasChildren = isset($arSection["subsections"])
         && is_array($arSection["subsections"])
         && count($arSection["subsections"]) > 0;
      if ($id > 0 && $bHasChildren) {
         SectionAdd($arSection["subsections"], $id);
      }
   }
}

/**
 * Returns the ID of a dictionary entry with the given name, creating the
 * entry when the dictionary does not contain it yet.
 *
 * Supported categories and the info blocks new entries are created in:
 * brand (4), firm (section of info block 4), country (5), prop (9),
 * measure (10), color (11), pack (6), type (8).
 *
 * @param string $category Dictionary kind, one of the values listed above.
 * @param string $name     Entry name; it is trimmed by DeleteFirstLastSpaces()
 *                         and lower-cased before the lookup.
 * @return mixed Existing or newly created ID; the error text of the failed
 *               Add() call ('error add' for "firm"); an empty string for an
 *               unknown category.
 */
function CategoryAdd($category, $name){
   $name = DeleteFirstLastSpaces($name);
   // NOTE(review): strtolower() works on bytes, so non-ASCII (e.g. Cyrillic)
   // names may not be lower-cased. Left as is because the Get*() dictionaries
   // presumably build their keys the same way - confirm before changing.
   $name = strtolower($name);

   // Creates an active element in the given info block; yields the new ID or
   // the error text of the failed Add().
   $addElement = function ($iblockId, $elementName) {
      $el = new CIBlockElement;
      $id = $el->Add(array(
         "IBLOCK_ID" => $iblockId,
         "NAME"      => $elementName,
         "ACTIVE"    => "Y"
      ));
      return $id ? $id : $el->LAST_ERROR;
   };

   // Lookup in a dictionary whose ITEMS map name => ID; null when absent.
   $findByName = function ($arDictionary) use ($name) {
      $arItems = $arDictionary["ITEMS"];
      return isset($arItems[$name]) ? $arItems[$name] : null;
   };

   // Lookup in a dictionary whose ITEMS map ID => name; null when absent.
   $findByValue = function ($arDictionary) use ($name) {
      $arItems = $arDictionary["ITEMS"];
      if (in_array($name, $arItems)) {
         return array_search($name, $arItems);
      }
      return null;
   };

   switch ($category) {
      case "brand":
         $found = $findByName(GetBrands());
         if ($found !== null) {
            return $found;
         }
         return $addElement("4", my_mb_ucfirst($name));

      case "firm":
         $found = $findByName(GetFirms());
         if ($found !== null) {
            return $found;
         }
         // Firms are sections of info block 4, not elements.
         $bs = new CIBlockSection;
         $id = $bs->Add(array(
            "IBLOCK_ID" => "4",
            "NAME"      => my_mb_ucfirst($name),
            "ACTIVE"    => "Y"
         ));
         return $id ? $id : 'error add';

      case "country":
         $found = $findByName(GetCountries());
         if ($found !== null) {
            return $found;
         }
         return $addElement("5", my_mb_ucfirst($name));

      case "prop":
         $found = $findByValue(GetProps());
         if ($found !== null) {
            return $found;
         }
         return $addElement("9", $name);

      case "measure":
         $found = $findByValue(GetMesuares());
         if ($found !== null) {
            return $found;
         }
         return $addElement("10", $name);

      case "color":
         $found = $findByValue(GetColors());
         if ($found !== null) {
            return $found;
         }
         return $addElement("11", $name);

      case "pack":
         $found = $findByName(GetPacks());
         if ($found !== null) {
            return $found;
         }
         // The name is already lower-cased above.
         return $addElement("6", $name);

      case "type":
         $found = $findByName(GetTypes());
         if ($found !== null) {
            return $found;
         }
         return $addElement("8", my_mb_ucfirst($name));
   }

   return "";
}

// ---------------------------------------------------------------------------
// Action dispatcher: runs the handler selected by $_REQUEST["action"] and
// prints its result as a JS object. $arRes always holds the response.
// ---------------------------------------------------------------------------

// Request parameters may arrive as arrays (e.g. "id[]=x"); intval() of a
// non-empty array is 1, so only scalar values are accepted as IDs.
$action = is_string($_REQUEST["action"]) ? $_REQUEST["action"] : "";
$elementId = (isset($_REQUEST["id"]) && is_scalar($_REQUEST["id"])) ? intval($_REQUEST["id"]) : 0;

// action name => parser function; the three parse actions differ only in the parser.
$arParsers = array(
   "parse_av"       => "ParseAv",
   "parse_utka"     => "ParseUtka",
   "parse_okey_msk" => "ParseOkeyMsk",
);

if ($action == "add_menu") {
   // Failure default, so that the response is defined on every path.
   $arRes = 'notok';
   // Root section of each shop inside the catalogue.
   $arShopRoots = array("av" => 837, "utka" => 838, "okey" => 1576);
   $siteShop = (isset($_REQUEST["site_shop"]) && is_string($_REQUEST["site_shop"])) ? $_REQUEST["site_shop"] : "";

   if (isset($_REQUEST["sections"]) && is_array($_REQUEST["sections"]) && isset($arShopRoots[$siteShop])) {
      SectionAdd($_REQUEST["sections"], $arShopRoots[$siteShop]);
      $arRes = array('OK' => "ok");
   }
}
elseif (isset($arParsers[$action])) {
   $arRes = 'notok';
   if (isset($_REQUEST["section"]) && is_array($_REQUEST["section"])) {
      $arSection = $_REQUEST["section"];
      if (isset($arSection["UF_LINK"]) && is_string($arSection["UF_LINK"]) && $arSection["UF_LINK"] != "") {
         // The mere presence of "pager_view" switches pager parsing on.
         $bPagerView = isset($_REQUEST["pager_view"]);
         $parser = $arParsers[$action];
         $arRes = $parser($arSection["UF_LINK"], $bPagerView);
         $arRes["ADDING"] = AddProducts($arRes["PRODUCTS"], isset($arSection["ID"]) ? $arSection["ID"] : null);
         $arRes["SECTION"] = $_REQUEST["section"];
      }
   }
}
elseif ($action == "add_category") {
   $bHasCategory = isset($_REQUEST["category"]) && $_REQUEST["category"] != "";
   $bHasName = isset($_REQUEST["name_category"]) && $_REQUEST["name_category"] != "";
   if ($bHasCategory && $bHasName) {
      $arRes = CategoryAdd($_REQUEST["category"], $_REQUEST["name_category"]);
   }
   else {
      $arRes = 'empty_data';
   }
}
elseif ($action == "update_item_av") {
   $arRes = 'ok';
   // No properties are filled in here, so the call below has nothing to store.
   $arProp = array();
   $itemId = 0;
   if (isset($_REQUEST["item"]) && is_array($_REQUEST["item"]) && isset($_REQUEST["item"]["ID"]) && is_scalar($_REQUEST["item"]["ID"])) {
      $itemId = intval($_REQUEST["item"]["ID"]);
   }
   if ($itemId > 0) {
      CIBlockElement::SetPropertyValuesEx($itemId, "3", $arProp);
   }
}
elseif ($action == "del_elem_pr") {
   // Clears the preview text of the element.
   if ($elementId > 0) {
      $el = new CIBlockElement;
      $el->Update($elementId, array("PREVIEW_TEXT" => ""));
      $arRes = "del_pr_ok!";
   }
   else {
      $arRes = "bad_id";
   }
}
elseif ($action == "del_elem") {
   if ($elementId > 0) {
      CIBlockElement::Delete($elementId);
      $arRes = "del_elem_ok!";
   }
   else {
      $arRes = "bad_id";
   }
}
else {
   $arRes = 'notaction';
}

// Drop everything buffered so far and answer with the result only.
$APPLICATION->RestartBuffer();
echo CUtil::PhpToJSObject($arRes);
die();
?>
Если блог был полезным, можете угостить меня "чашечкой кофе" :)

Сбер по номеру телефона +7 (953) 585-13-09 Вероника.
Спасибо!