module globals
! Global variables
use omp_lib                                    ! help the compiler find the OMP libraries
implicit none
integer :: n = 1000
double precision, parameter :: L=1.0d0
double precision, parameter :: pi=2q0*asin(1q0) ! numerical constant
end module globals

module Langevin
! Initialization and update rule for Langevin particles
use globals
implicit none
logical, allocatable, dimension(:) :: is_tracked
double precision :: dt,kT,g,m,sigma,eps,rc      ! time step size and physical parameters
double precision :: pref1,pref2                 ! auxiliary parameters
double precision, allocatable, dimension(:) :: x,y,vx,vy,ax,ay,vhx,vhy,x0,y0 ! particle positions, accellerations, velocities, half-step velocities, initial positions
contains
subroutine set_parameters

! Set time step and physical parameters
dt=0.00001d0 ! time step size
kT=0.1d0    ! energy
g=0.1d0     ! drag coefficient
m=1d0     ! mass of the particles, can be normalized to 1.
sigma=1.0d-3              ! Potential parameters
eps=1d0
rc=sigma*(2.0d0**(1.0d0/6.0d0)) ! Effective particle size

! Set auxiliary parameters
pref1=g
pref2=sqrt(24d0*kT*g/dt)

end subroutine set_parameters
subroutine initialize_particles
integer :: i, j, k, nx, ix, iy
double precision :: ran1(n),ran2(n),gr1(n),gr2(n),dist, lx, ly
logical, allocatable, dimension(:,:) :: is_filled
! Give particles initial position and velocity

   call random_seed()
   call random_number(ran1)                       ! uses the built-in PRNG, easy but not very accurate
   call random_number(ran2)
   !x=L*(ran1-0.5d0)
   !x0=x
   !y=L*(ran2-0.5d0)
   !y0=y

   !> Basic box stacking with no RNG
   !nx = ceiling(sqrt(real(n)))
   !dist = (0.9*L)/nx
   !k = 1
   !outer: do i = 1,nx
   ! do j = 1,nx
   !     x(k) = i*dist-L/2
   !     y(k) = j*dist-L/2
   !     k = k + 1
   !     if (k.eq.n) exit outer
   ! end do
   !end do outer
   
   !> Box RNG on a finer mesh 
   nx = ceiling(L/rc)
   dist = 0.9*L/nx

   allocate(is_filled(nx,nx))
   k = 1

   !> Assuming (nx)*(nx) grid, first scale random numbers to a max of nx
   do while (k .lt. n)
       !> Random numbers
       call random_number(lx)
       call random_number(ly)

       !> Scale them to be grid points
       ix = ceiling(lx*nx)
       iy = ceiling(ly*nx)

       !> Check if grid is occupied
       if (is_filled(ix,iy)) cycle

       !> If not, set the particle position and call it a day
       x(k) = ix*rc - L/2
       y(k) = iy*rc - L/2

       !> And mark the spot as filled
       is_filled(ix,iy) = .true.

       !> Increment!!
       k = k + 1
   end do

   ax=0d0
   ay=0d0
   call random_number(ran1)
   call random_number(ran2)
   gr1=sqrt(kT/(m))*sqrt(-2*log(ran1))*cos(2*pi*ran2) ! Box-Mueller transform
   gr2=sqrt(kT/(m))*sqrt(-2*log(ran1))*sin(2*pi*ran2)
   vx=gr1
   vy=gr2

end subroutine initialize_particles
end module Langevin

module domainDecomposition
  use globals
  use Langevin
  implicit none
   integer :: b=4
   integer, allocatable :: nbl(:,:)
contains
   subroutine set_sector_grid(side_count)
      integer, intent(in) :: side_count

      if (side_count .le. 0) then
          print *, 'Error: sector side count must be > 0.'
          stop 1
      end if

      b = side_count
      if (allocated(nbl)) deallocate(nbl)
      allocate(nbl(0:b*b-1,9))
      nbl = -1

      
   end subroutine set_sector_grid

  include "neighbourList.f90"
  include "putParticleInBox.f90"
  include "sortParticles.f90"
end module domainDecomposition

module BC
   ! Subroutines related to the boundary conditions
   use globals
   use langevin
   implicit none
contains
   subroutine impose_BC(i)
     integer :: i
     !> Recall we are inside an LxL box centered at the origin

     !> Top boundary
     if(y(i) .GT. L/2) then
         y(i) = L - y(i)
         vhy(i) = -vhy(i)
     end if

     !> Left boundary
     if(x(i) .LT. -L/2) then
         x(i) = -L - x(i)
         vhx(i) = -vhx(i)
     end if

     !> Bottom boundary
     if(y(i) .LT. -L/2) then
         y(i) = -L - y(i)
         vhy(i) = -vhy(i)
     end if

     !> Right boundary
     if(x(i) .GT. L/2) then
         x(i) = L - x(i)
         vhx(i) = -vhx(i)
     end if

     if(abs(x(i)).gt.L/2 .or. abs(y(i)).gt.L/2) is_tracked(i)=.false.

   end subroutine impose_BC
end module BC

program main

use globals
use domainDecomposition
use Langevin
use BC
implicit none
integer :: i,j,s,ns,p1,p2,nthreads,sector_side
integer, allocatable :: lim(:,:)
double precision :: t,t_max,m1,m2,rx,ry,dij,F,msd
double precision :: wtime,begin,end,Tint,TsinceWrite
integer :: NumParticles, NumThreads, NumSectors, tsteps
double precision :: moveTime,sortTime,interTime,reactTime,writeTime,rngTime,initTime,overTime,totalTime,wallTime
double precision :: t0_move,t1_move,t0_sort,t1_sort,t0_inter,t1_inter,t0_react,t1_react,t0_write,t1_write,t0_rng,t1_rng,t0_init,t1_init,t0_over,t1_over
double precision, allocatable, dimension(:) :: ran1,ran2,xscrap,yscrap,vxscrap,vyscrap
double precision :: cellcount


begin = omp_get_wtime()

! Set all times to 0
moveTime=0d0   ! Time spent moving particles (position and halfstep velocity updates)
sortTime=0d0   ! Time spent sorting particles into sectors
interTime=0d0  ! Time spent computing interactions (force updates)
reactTime=0d0  ! Time spent handling reactions (accel and fullstep velocity updates)
writeTime=0d0  ! Time spent writing output
rngTime=0d0    ! Time spent generating random numbers
initTime=0d0   ! Time spent initializing particles
overTime=0d0   ! Time spent on overhead tasks
totalTime=0d0  ! Total simulation time accounted for (sum of all above)
wallTime=0d0   ! Total wall clock time for the simulation measured

t0_over = omp_get_wtime()
! Open files
open(11,file='trajectories.xyz')
open(12,file='means.out')
open(13,file='kinetic_energies.out')
open(14,file='data/timings.csv', position='append')
t1_over = omp_get_wtime()
overTime = overTime + (t1_over-t0_over)

t0_init = omp_get_wtime()
call parse_command_line(nthreads,sector_side)
call set_sector_grid(sector_side)
call omp_set_num_threads(nthreads)

! Allocate arrays
allocate(x(n),y(n),vx(n),vy(n),ax(n),ay(n),vhx(n),vhy(n),x0(n),y0(n),is_tracked(n),ran1(n),ran2(n),xscrap(n),yscrap(n),vxscrap(n),vyscrap(n))
allocate(lim(0:b*b,2))

call buildNBL()

is_tracked = .True.
t=0d0
tsteps=0
t_max= 4.0d0     ! integration time
Tint = 0.001d0
tsinceWrite=0d0
call set_parameters
call initialize_particles
t1_init = omp_get_wtime()
initTime = initTime + (t1_init-t0_init)

print *, 'Starting simulation with rc=',rc,', sigma=',sigma,' and L/b=',L/real(b),'.'

! Check if side length is smaller than cutoff radius
cellcount = real(sector_side)
if (L/cellcount .lt. rc) then
      print *, 'Error: sector size (L/b) must be >= cutoff radius (rc).'
      print *, 'Maximum sectors per dimension for L = ',L,' and rc = ',rc,' is ',ceiling(L/rc)
      stop 1
end if

print *, 'Running simulation with n=',n,' particles, b=',b,' sectors per dimension, and using ',omp_get_max_threads(),' threads.'
print *, "rc=",rc," and L/b=",L/real(sector_side)

x0 = x
y0 = y

!$omp parallel 
do while(t.lt.t_max)

   ! one thread: fetch pseudo-random numbers
   ! one thread: update velocity, position, impose BC
   xscrap=x
   yscrap=y
   vxscrap=vx
   vyscrap=vy
   !$omp sections 
   !$omp section  ! Write to disk
      t0_write = omp_get_wtime()
       if(tSinceWrite.gt.Tint) then
         msd = 0d0 ! reset msd
         write(11,'(i0)') count(is_tracked)
         write(11,'(f10.5)') t
         do i=1,n
            ! Write traj line like this: "P0 x y z" where P is a literal character, 0 is the particle index, and x,y,z are the coordinates. Use fixed format with 5 decimal places for the coordinates and a field width of 10 characters for each coordinate, and separate them by a single space.
            write(11,'("P",i0,1x,f10.5,1x,f10.5,1x,f10.5)') i, xscrap(i),yscrap(i),0d0
            msd = msd + ((xscrap(i)-x0(i))**2 + (yscrap(i)-y0(i))**2)
         enddo
         
         write(12,*) t,sqrt(msd/real(count(is_tracked),8)),count(is_tracked)
         write(13,*) t,sum(0.5*vx**2 + 0.5*vy**2, mask=is_tracked)
         tSinceWrite=0d0
      end if
      t1_write = omp_get_wtime()
      writeTime = writeTime + (t1_write-t0_write)
   !$omp section
      t0_rng = omp_get_wtime()
      call random_number(ran1)
      ran1=ran1-0.5d0
      call random_number(ran2)
      ran2=ran2-0.5d0
      t1_rng = omp_get_wtime()
      rngTime = rngTime + (t1_rng-t0_rng)
   !$omp section
      t0_move = omp_get_wtime()
      vhx=vx+0.5d0*ax*dt
      vhy=vy+0.5d0*ay*dt
      x=x+vhx*dt
      y=y+vhy*dt
      do j=1,n
         call impose_BC(j)
      end do
      t1_move = omp_get_wtime()
      moveTime = moveTime + (t1_move-t0_move)

      t0_sort = omp_get_wtime()
      call order(x,y,vx,vy,x0,y0,vhx,vhy,lim) ! BUGFIX 11/03: half-step velocities must also be re-ordered.
      t1_sort = omp_get_wtime()
      sortTime = sortTime + (t1_sort-t0_sort)
      ax=0d0                   ! Add forces here if any
      ay=0d0                   ! Add forces here if any
      
   !$omp end sections
   !$omp flush

   ! Our first attempt at parallelization of the code: run the computation of the distances and interaction forces on multiple threads:

   !$omp single
   t0_inter = omp_get_wtime()
   !$omp end single

   !$omp do private(s,i,ns,p1,p2,rx,ry,dij,F)
      do s=0,b*b-1
         do i=1,9
            if(nbl(s,i).eq.-1) exit
            ns=nbl(s,i)  ! BUGFIX 11/03: the loop counter was used as sector index instead of the entries of nbl.
            do p1=lim(s,1),lim(s,2)
               do p2=lim(ns,1),lim(ns,2)
                  if(p1.eq.p2) cycle
                  rx=x(p2)-x(p1)
                  ry=y(p2)-y(p1)
                  dij=sqrt(rx**2 + ry**2)
                  if(dij.lt.rc) then
                     F=4d0*eps*( -12d0*sigma**12/dij**13 + 6D0* sigma**6/dij**7 )
                     ax(p1)=ax(p1)+F*rx/(dij*m)
                     ay(p1)=ay(p1)+F*ry/(dij*m)
                  end if
               end do
            end do
         end do
         
      end do
   !$omp end do
   
   !$omp single
   t1_inter = omp_get_wtime()
   interTime = interTime + (t1_inter - t0_inter)
   !$omp end single

   
   !$omp single
      t0_react = omp_get_wtime()
      vx=vhx+0.5d0*ax*dt
      vy=vhy+0.5d0*ay*dt
      t=t+dt
      tSinceWrite=tSinceWrite+dt
      t1_react = omp_get_wtime()
      reactTime = reactTime + (t1_react - t0_react)
   !$omp end single
  
   tsteps = tsteps + 1
end do
!$omp end parallel

print *, 'Time-stepping loop finished after ', tsteps, ' steps.'
end = omp_get_wtime()
wallTime = end - begin
totalTime = moveTime + sortTime + interTime + reactTime + writeTime + rngTime + initTime + overTime
!call cpu_time(end)
print *,'Parameters:'
print *,'NumParticles=',n
print *,'NumThreads=',nthreads
print *,'NumSectors per dim=',b
print *,'Timing:'
print *,'InitTime=',initTime
print *,'WriteTime=',writeTime
print *,'RNGTime=',rngTime
print *,'MoveTime=',moveTime
print *,'SortTime=',sortTime
print *,'InterTime=',interTime
print *,'ReactTime=',reactTime
print *,'OverheadTime=',overTime
print *,'TotalTime=',totalTime
print *,'WallTime=',wallTime

! Append times to csvfile with each line as: NumParticles,NumThreads,NumSectors,wallTime,totalTime,moveTime,sortTime,interTime,reactTime,writeTime,rngTime,initTime,overTime
! with commas as separators
NumParticles = n
NumThreads = nthreads
NumSectors = b*b
write(14,'("FullPara,",i0,",",i0,",",i0,",",f10.5,",",f10.5,",",f10.5,",",f10.5,",",f10.5,",",f10.5,",",f10.5,",",f10.5,",",f10.5,",",f10.5)') &
   NumParticles, NumThreads, NumSectors, wallTime, totalTime, moveTime, sortTime, interTime, reactTime, writeTime, rngTime, initTime, overTime


if(lim(b*b,2).gt.lim(b*b,1)) print '(a5,i7,a8,i8,a11)','Lost ',lim(b*b,2)-lim(b*b,1),' out of ',n,' particles.'
! De-allocate arrays
deallocate(x,y,vx,vy,ax,ay,x0,y0,is_tracked,ran1,ran2,xscrap,yscrap,vxscrap,vyscrap)
if (allocated(lim)) deallocate(lim)
if (allocated(nbl)) deallocate(nbl)
! Close files
close(11)
close(12)
close(13)
close(14)

contains

subroutine parse_command_line(nthreads_out,sector_side_out)
   integer, intent(out) :: nthreads_out,sector_side_out
   integer :: argc,idx,parsed_value
   character(len=64) :: arg,value

   nthreads_out = omp_get_max_threads()
   sector_side_out = b

  argc = command_argument_count()
  idx = 1
  do while (idx .le. argc)
     call get_command_argument(idx,arg)
     select case (trim(arg))
     case ('-N')
        if (idx .ge. argc) call usage_and_stop('Missing value for -N')
        idx = idx + 1
        call get_command_argument(idx,value)
        call parse_positive_integer(value,'-N',parsed_value)
        n = parsed_value
     case ('-P')
        if (idx .ge. argc) call usage_and_stop('Missing value for -P')
        idx = idx + 1
        call get_command_argument(idx,value)
        call parse_positive_integer(value,'-P',parsed_value)
        nthreads_out = parsed_value
     case ('-S')
        if (idx .ge. argc) call usage_and_stop('Missing value for -S')
        idx = idx + 1
        call get_command_argument(idx,value)
        call parse_positive_integer(value,'-S',parsed_value)
        sector_side_out = parsed_value
     case default
        call usage_and_stop('Unknown argument: '//trim(arg))
     end select
     idx = idx + 1
  end do
end subroutine parse_command_line

subroutine parse_positive_integer(text,flag_name,parsed_value)
  character(len=*), intent(in) :: text,flag_name
  integer, intent(out) :: parsed_value
  integer :: io

  read(text,*,iostat=io) parsed_value
  if (io .ne. 0 .or. parsed_value .le. 0) then
     call usage_and_stop('Invalid value for '//trim(flag_name)//': '//trim(text))
  end if
end subroutine parse_positive_integer

subroutine usage_and_stop(message)
  character(len=*), intent(in) :: message

  print *, trim(message)
  print *, 'Usage: ./simulationLD [-N <num>] [-P <num>] [-S <num>]'
  print *, '  -N <num>  Number of particles'
  print *, '  -P <num>  Number of OpenMP threads'
   print *, '  -S <num>  Number of sectors per dimension (total = S*S)'
  stop 1
end subroutine usage_and_stop

end program main