这段代码:
{-# LANGUAGE BangPatterns #-}
module Main where
import Data.Bits
import Data.Word
import Control.Monad
import System.CPUTime
import Data.List
-- The Damenproblem.
-- Wiki: https://de.wikipedia.org/wiki/Damenproblem
main :: IO ()
main = do
start <- getCPUTime
print $ dame 14
end <- getCPUTime
print $ "Needed " ++ (show ((fromIntegral (end - start)) / (10^12))) ++ " Seconds"
type BitState = (Word64, Word64, Word64)
dame :: Int -> Int
dame max = foldl' (+) 0 $ map fn row
where fn x = recur (max - 2) $ nextState (x, x, x)
recur !depth !state = foldl' (+) 0 $ flip map row $ getPossible depth (getStateVal state) state
getPossible depth !stateVal state bit
| (bit .&. stateVal) > 0 = 0
| depth == 0 = 1
| otherwise = recur (depth - 1) (nextState (addBitToState bit state))
row = take max $ iterate moveLeft 1
getStateVal :: BitState -> Word64
getStateVal (l, r, c) = l .|. r .|. c
addBitToState :: Word64 -> BitState -> BitState
addBitToState l (ol, or, oc) = (ol .|. l, or .|. l, oc .|. l)
nextState :: BitState -> BitState
nextState (l, r, c) = (moveLeft l, moveRight r, c)
moveRight :: Word64 -> Word64
moveRight x = shiftR x 1
moveLeft :: Word64 -> Word64
moveLeft x = shift x 1大约需要60秒才能执行。如果我用-O2启用编译器优化,大约需要7秒。-O1更快,大约需要5秒。测试了这段代码的java版本,用for循环代替了映射列表,大约需要1秒(!)。我一直在尽我最大的努力优化,但我在网上找到的提示都没有超过半秒的帮助。请帮帮忙
编辑: Java版本:
public class Queens{
static int getQueens(){
int res = 0;
for (int i = 0; i < N; i++) {
int pos = 1 << i;
res += run(pos << 1, pos >> 1, pos, N - 2);
}
return res;
}
static int run(long diagR, long diagL, long mid, int depth) {
long valid = mid | diagL | diagR;
int resBuffer = 0;
for (int i = 0; i < N; i++) {
int pos = 1 << i;
if ((valid & pos) > 0) {
continue;
}
if (depth == 0) {
resBuffer++;
continue;
}
long n_mid = mid | pos;
long n_diagL = (diagL >> 1) | (pos >> 1);
long n_diagR = (diagR << 1) | (pos << 1);
resBuffer += run(n_diagR, n_diagL, n_mid, depth - 1);
}
return resBuffer;
}
}编辑:在3.2 ghc的i5 650上运行GHC8.4.1。
发布于 2018-04-05 00:51:42
假设您的算法是正确的(我还没有验证这一点),我能够得到一致的900ms (比Java实现还快!)。在我的机器上,-O2和-O3都是可比的。
值得注意的变化:(编辑:最重要的变化:从List切换到Vector)切换到GHC8.4.1,使用严格,BitState现在是一个严格的三元组使用Vectors很重要,以实现更好的速度-在我看来,你无法达到仅使用链表的速度,即使融合。未装箱的矢量很重要,因为您知道矢量将始终是Word64s或Ints。
{-# LANGUAGE BangPatterns #-}
module Main (main) where
import Data.Bits ((.&.), (.|.), shiftR, shift)
import Data.Vector.Unboxed (Vector)
import qualified Data.Vector.Unboxed as Vector
import Data.Word (Word64)
import Prelude hiding (max, sum)
import System.CPUTime (getCPUTime)
--
-- The Damenproblem.
-- Wiki: https://de.wikipedia.org/wiki/Damenproblem
main :: IO ()
main = do
start <- getCPUTime
print $ dame 14
end <- getCPUTime
print $ "Needed " ++ (show ((fromIntegral (end - start)) / (10^12))) ++ " Seconds"
data BitState = BitState !Word64 !Word64 !Word64
bmap :: (Word64 -> Word64) -> BitState -> BitState
bmap f (BitState x y z) = BitState (f x) (f y) (f z)
{-# INLINE bmap #-}
bfold :: (Word64 -> Word64 -> Word64) -> BitState -> Word64
bfold f (BitState x y z) = x `f` y `f` z
{-# INLINE bfold #-}
singleton :: Word64 -> BitState
singleton !x = BitState x x x
{-# INLINE singleton #-}
dame :: Int -> Int
dame !x = sumWith fn row
where
fn !x' = recur (x - 2) $ nextState $ singleton x'
getPossible !depth !stateVal !state !bit
| (bit .&. stateVal) > 0 = 0
| depth == 0 = 1
| otherwise = recur (depth - 1) (nextState (addBitToState bit state))
recur !depth !state = sumWith (getPossible depth (getStateVal state) state) row
!row = Vector.iterateN x moveLeft 1
sumWith :: (Vector.Unbox a, Vector.Unbox b, Num b) => (a -> b) -> Vector a -> b
sumWith f as = Vector.sum $ Vector.map f as
{-# INLINE sumWith #-}
getStateVal :: BitState -> Word64
getStateVal !b = bfold (.|.) b
addBitToState :: Word64 -> BitState -> BitState
addBitToState !l !b = bmap (.|. l) b
nextState :: BitState -> BitState
nextState !(BitState l r c) = BitState (moveLeft l) (moveRight r) c
moveRight :: Word64 -> Word64
moveRight !x = shiftR x 1
{-# INLINE moveRight #-}
moveLeft :: Word64 -> Word64
moveLeft !x = shift x 1
{-# INLINE moveLeft #-}我用ghc dame.hs -O2 -fforce-recomp -ddump-simpl -dsuppress-all检查了内核,它看起来很好(也就是说,一切都拆箱了,循环看起来很好)。我曾担心getPossible的部分应用可能会有问题,但事实证明并非如此。我觉得如果我能更好地理解算法,就有可能以更好/更高效的方式编写代码,但我并不太担心--这仍然可以击败Java实现。
https://stackoverflow.com/questions/49652730
复制相似问题