日安,
我正在使用cURL和多种解析技术从多个网站检索信息。我把代码写成了可扩展的形式,以便在需要时添加更多要扫描的网站。
检索到的信息如下:(请注意,信息可能不准确,可能不反映真实价格/名称)
Array
(
[website1.com] => Array
(
[0] => Array
(
[0] => 60" BRAVIA LX900 Series 3D HDTV
[1] => website1.com
[2] => 5299.99
)
[1] => Array
(
[0] => 52" BRAVIA LX900 Series 3D HDTV
[1] => website1.com
[2] => 4499.99
)
[2] => Array
(
[0] => 46" BRAVIA LX900 Series 3D HDTV
[1] => website1.com
[2] => 3699.99
)
[3] => Array
(
[0] => 40" BRAVIA LX900 Series 3D HDTV
[1] => website1.com
[2] => 2999.99
)
)
[website2.com] => Array
(
[0] => Array
(
[0] => Sony 3D 60" LX900 HDTV BRAVIA
[1] => website2.com
[2] => 5400.99
)
[1] => Array
(
[0] => Sony 3D 52" LX900 HDTV BRAVIA
[1] => website2.com
[2] => 4699.99
)
[2] => Array
(
[0] => Sony 3D 46" LX900 HDTV BRAVIA
[1] => website2.com
[2] => 3899.99
)
)
)
所需的输出必须是:
Array
(
[0] => Array
(
[Name] => 60" BRAVIA LX900 Series 3D HDTV
[website1.com] => 5299.99
[website2.com] => 5400.99
)
[1] => Array
(
[Name] => 52" BRAVIA LX900 Series 3D HDTV
[website1.com] => 4499.99
[website2.com] => 4699.99
)
[2] => Array
(
[Name] => 46" BRAVIA LX900 Series 3D HDTV
[website1.com] => 3699.99
[website2.com] => 3899.99
)
[3] => Array
(
[Name] => 40" BRAVIA LX900 Series 3D HDTV
[website1.com] => 2999.99
)
)
请注意,各网站上的名称可能会有所不同,因此需要使用similar_text。此外,有些产品可能不会出现在所有网站上。我知道最终只能选用一个电视名称,因此我会使用最相关来源(website1.com)上的名称。
这是我想要做的代码。
<?php
// Scraped listings grouped by source website. Every listing is a positional
// triple: array(product name, website, price).
$_Retreived = array(
    "website1.com" => array(
        array('60" BRAVIA LX900 Series 3D HDTV', 'website1.com', 5299.99),
        array('52" BRAVIA LX900 Series 3D HDTV', 'website1.com', 4499.99),
        array('46" BRAVIA LX900 Series 3D HDTV', 'website1.com', 3699.99),
        array('40" BRAVIA LX900 Series 3D HDTV', 'website1.com', 2999.99),
    ),
    "website2.com" => array(
        array('Sony 3D 60" LX900 HDTV BRAVIA', 'website2.com', 5400.99),
        array('Sony 3D 52" LX900 HDTV BRAVIA', 'website2.com', 4699.99),
        array('Sony 3D 46" LX900 HDTV BRAVIA', 'website2.com', 3899.99),
    ),
);

// Comparison table that the matching step further down fills in.
$_Prices = array();

// Websites to process, in the order their listings are concatenated.
$_Sites = array("website1.com", "website2.com");

// Flatten the per-website lists into one numerically indexed list;
// array_merge() renumbers the entries 0..n-1 across all websites.
$_PricesTemp = array();
foreach ($_Sites as $_SiteName) {
    $_PricesTemp = array_merge($_PricesTemp, $_Retreived[$_SiteName]);
}
/*
print_r($_PricesTemp);
Array
(
[0] => Array
(
[0] => 60" BRAVIA LX900 Series 3D HDTV
[1] => website1.com
[2] => 5299.99
)
[1] => Array
(
[0] => 52" BRAVIA LX900 Series 3D HDTV
[1] => website1.com
[2] => 4499.99
)
[2] => Array
(
[0] => 46" BRAVIA LX900 Series 3D HDTV
[1] => website1.com
[2] => 3699.99
)
[3] => Array
(
[0] => 40" BRAVIA LX900 Series 3D HDTV
[1] => website1.com
[2] => 2999.99
)
[4] => Array
(
[0] => Sony 3D 60" LX900 HDTV BRAVIA
[1] => website2.com
[2] => 5400.99
)
[5] => Array
(
[0] => Sony 3D 52" LX900 HDTV BRAVIA
[1] => website2.com
[2] => 4699.99
)
[6] => Array
(
[0] => Sony 3D 46" LX900 HDTV BRAVIA
[1] => website2.com
[2] => 3899.99
)
)
*/
// Compare every listing with every other listing. The inner list is walked
// from the last key to the first; array_reverse(..., true) keeps the
// original keys so a listing can be recognised and skipped against itself.
$_Reversed = array_reverse($_PricesTemp, true);
foreach ($_PricesTemp as $_KeyOne => $_EntryOne) {
    foreach ($_Reversed as $_KeyTwo => $_EntryTwo) {
        // Never compare a listing with itself.
        if ($_KeyOne == $_KeyTwo) {
            continue;
        }

        // Case-insensitive similarity of the two product names, in percent.
        $_Percent = 0;
        similar_text(strtoupper($_EntryOne[0]), strtoupper($_EntryTwo[0]), $_Percent);
        if ($_Percent < 90) {
            continue; // Names are less than 90% alike.
        }

        echo "Similar : <b>" . $_KeyOne . "</b> " . $_EntryOne[0]
            . " and <b>" . $_KeyTwo . "</b> " . $_EntryTwo[0]
            . " Percent : " . $_Percent . "<br />";

        // Each matching pair produces its own row, named after the outer
        // listing.
        $_Row = array('Name' => $_EntryOne[0]);
        foreach ($_Sites as $_Site) {
            // Price contributed by the outer listing, if it is from this site.
            if (isset($_EntryOne[1]) && $_EntryOne[1] == $_Site) {
                $_Row[$_Site] = $_EntryOne[2];
            }
            // Price contributed by the inner listing; when both listings come
            // from the same site this replaces the value written just above.
            if (isset($_EntryTwo[1]) && $_EntryTwo[1] == $_Site) {
                $_Row[$_Site] = $_EntryTwo[2];
            }
        }
        $_Prices[] = $_Row;
    }
}
/*
print_r($_Prices);
Array
(
[0] => Array
(
[Name] => 60" BRAVIA LX900 Series 3D HDTV
[website1.com] => 2999.99
)
[1] => Array
(
[Name] => 60" BRAVIA LX900 Series 3D HDTV
[website1.com] => 3699.99
)
[2] => Array
(
[Name] => 60" BRAVIA LX900 Series 3D HDTV
[website1.com] => 4499.99
)
[3] => Array
(
[Name] => 52" BRAVIA LX900 Series 3D HDTV
[website1.com] => 2999.99
)
[4] => Array
(
[Name] => 52" BRAVIA LX900 Series 3D HDTV
[website1.com] => 3699.99
)
[5] => Array
(
[Name] => 52" BRAVIA LX900 Series 3D HDTV
[website1.com] => 5299.99
)
[6] => Array
(
[Name] => 46" BRAVIA LX900 Series 3D HDTV
[website1.com] => 2999.99
)
[7] => Array
(
[Name] => 46" BRAVIA LX900 Series 3D HDTV
[website1.com] => 4499.99
)
[8] => Array
(
[Name] => 46" BRAVIA LX900 Series 3D HDTV
[website1.com] => 5299.99
)
[9] => Array
(
[Name] => 40" BRAVIA LX900 Series 3D HDTV
[website1.com] => 3699.99
)
[10] => Array
(
[Name] => 40" BRAVIA LX900 Series 3D HDTV
[website1.com] => 4499.99
)
[11] => Array
(
[Name] => 40" BRAVIA LX900 Series 3D HDTV
[website1.com] => 5299.99
)
[12] => Array
(
[Name] => Sony 3D 60" LX900 HDTV BRAVIA
[website2.com] => 3899.99
)
[13] => Array
(
[Name] => Sony 3D 60" LX900 HDTV BRAVIA
[website2.com] => 4699.99
)
[14] => Array
(
[Name] => Sony 3D 52" LX900 HDTV BRAVIA
[website2.com] => 3899.99
)
[15] => Array
(
[Name] => Sony 3D 52" LX900 HDTV BRAVIA
[website2.com] => 5400.99
)
[16] => Array
(
[Name] => Sony 3D 46" LX900 HDTV BRAVIA
[website2.com] => 4699.99
)
[17] => Array
(
[Name] => Sony 3D 46" LX900 HDTV BRAVIA
[website2.com] => 5400.99
)
)
*/
?>
首先,上面的代码不起作用。其中一定有逻辑错误,但我找不出来。另外,我认为在列表中添加第三个网站后,这段代码也无法正常工作。
伙计们有什么想法吗?从今天早上开始我就一直在做这个。
编辑2011-02-16:
我在这个问题上加了一个赏金。
发布于 2011-02-19 06:57:09
试试这个更清晰的 gist:https://gist.github.com/835099
这给我带来了你想要的结果。
发布于 2010-09-09 16:47:19
高层次的概述应如下所示:
您应该考虑用levenshtein()代替similar_text():两者在实际效果上相似,但levenshtein()要快得多。
下面是一些(未经测试、随手写的)代码:
// Two product names count as the same product when their Levenshtein
// distance is strictly below this value.
$levThreshold = 3 ;
$_Prices = array() ;
// $_Retreived maps each website to a LIST of items, so every item has to be
// visited individually. Each item is array(name, website, price).
foreach ($_Retreived as $website => $websiteItems) {
    foreach ($websiteItems as $item) {
        $currName = $item[0] ;
        $currPrice = $item[2] ;
        $foundItemKey = false ;
        // Look for an existing row describing the same product. $priceData
        // is taken by reference so the price is written into $_Prices itself
        // rather than into the loop's copy.
        foreach ($_Prices as &$priceData) {
            if (isset($priceData[$website])) {
                // This row already holds a price from the current website.
                continue ;
            }
            // Check whether this row's name is the item we are looping over.
            $lev = levenshtein($priceData['Name'], $currName) ;
            // levenshtein() returns -1 on PHP < 8.0 when an argument is
            // longer than 255 characters; never treat that as a match.
            if ($lev >= 0 && $lev < $levThreshold) {
                // Item exists: add the price and stop searching.
                $priceData[$website] = $currPrice ;
                $foundItemKey = true ;
                break ;
            }
        }
        // Break the reference so a later assignment to $priceData cannot
        // overwrite an element of $_Prices.
        unset($priceData) ;
        // No existing row matched: start a new one for this product.
        if (!$foundItemKey) {
            $newItem = array() ;
            $newItem['Name'] = $currName ;
            $newItem[$website] = $currPrice ;
            $_Prices[] = $newItem ;
        }
    }
}
$levThreshold是两个字符串之间必须有不同的字符数,才能被认为不同。你可以相应地调整它。
发布于 2011-02-21 20:30:08
使用similar_text无法回答这个问题。您希望将60" BRAVIA LX900 Series 3D HDTV与Sony 3D 60" LX900 HDTV BRAVIA匹配。然而,60" BRAVIA LX900 Series 3D HDTV实际上更类似于52" BRAVIA LX900 Series 3D HDTV,只有两个字符不同。
我怀疑您将需要一个自定义处理程序来匹配特定于您要匹配的产品的详细信息。例如,对于电视机,您可能希望匹配大小(xx")和产品系列(BRAVIA LX900)。
这并不能解决你的问题,但恐怕答案就是如此。
https://stackoverflow.com/questions/3678457
复制相似问题