aomp icon indicating copy to clipboard operation
aomp copied to clipboard

[aomp] Memory access fault with custom mapper

Open tpkessler opened this issue 5 years ago • 12 comments
trafficstars

I'm using AOMP 11.7-1 STANDALONE compiled from source on Arch Linux. When compiling OpenMP code with a custom mapper I always get a memory access fault.

The following code is taken from the official OpenMP 5.0 examples:

#include <stdlib.h>
#include <stdio.h>
#define N 100
typedef struct myvec{
    size_t len;
    double *data;
} myvec_t;

#pragma omp declare mapper(myvec_t v) \
    map(v, v.data[0:v.len])
void init(myvec_t *s);

int main(){
    myvec_t s;
    s.data = (double *)calloc(N,sizeof(double));
    s.len = N;
    #pragma omp target map(s)
    init(&s);
    
    printf("s.data[%d]=%lf\n",N-1,s.data[N-1]);
}

void init(myvec_t *s)
{ for(int i=0; i<s->len; i++) s->data[i]=i; }
$ $AOMP/bin/hipcc -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=$($AOMP/bin/mygpu) target_mapper.1.c -o target_mapper.1
$ ./target_mapper.1
[GPU Memory Error] Addr: 0x1888000 Reason: Page not present or supervisor privilege. 
Memory access fault by GPU node-1 (Agent handle: 0x188db50) on address 0x1888000. Reason: Page not present or supervisor privilege.
[1]    12891 abort (core dumped)  ./target_mapper.1

My system:

$ $AOMP/bin/rocminfo
ROCk module is loaded
Able to open /dev/kfd read-write
=====================    
HSA System Attributes    
=====================    
Runtime Version:         1.1
System Timestamp Freq.:  1000.000000MHz
Sig. Max Wait Duration:  18446744073709551615 (0xFFFFFFFFFFFFFFFF) (timestamp count)
Machine Model:           LARGE                              
System Endianness:       LITTLE                             

==========               
HSA Agents               
==========               
*******                  
Agent 1                  
*******                  
  Name:                    AMD Ryzen 7 2700X Eight-Core Processor
  Uuid:                    CPU-XX                             
  Marketing Name:          AMD Ryzen 7 2700X Eight-Core Processor
  Vendor Name:             CPU                                
  Feature:                 None specified                     
  Profile:                 FULL_PROFILE                       
  Float Round Mode:        NEAR                               
  Max Queue Number:        0(0x0)                             
  Queue Min Size:          0(0x0)                             
  Queue Max Size:          0(0x0)                             
  Queue Type:              MULTI                              
  Node:                    0                                  
  Device Type:             CPU                                
  Cache Info:              
    L1:                      32768(0x8000) KB                   
  Chip ID:                 0(0x0)                             
  Cacheline Size:          64(0x40)                           
  Max Clock Freq. (MHz):   3700                               
  BDFID:                   0                                  
  Internal Node ID:        0                                  
  Compute Unit:            16                                 
  SIMDs per CU:            0                                  
  Shader Engines:          0                                  
  Shader Arrs. per Eng.:   0                                  
  WatchPts on Addr. Ranges:1                                  
  Features:                None
  Pool Info:               
    Pool 1                   
      Segment:                 GLOBAL; FLAGS: KERNARG, FINE GRAINED
      Size:                    32896456(0x1f5f5c8) KB             
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       TRUE                               
    Pool 2                   
      Segment:                 GLOBAL; FLAGS: COARSE GRAINED      
      Size:                    32896456(0x1f5f5c8) KB             
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       TRUE                               
  ISA Info:                
    N/A                      
*******                  
Agent 2                  
*******                  
  Name:                    gfx900                             
  Uuid:                    GPU-0213f2a912ee21a4               
  Marketing Name:          Vega 10 XL/XT [Radeon RX Vega 56/64]
  Vendor Name:             AMD                                
  Feature:                 KERNEL_DISPATCH                    
  Profile:                 BASE_PROFILE                       
  Float Round Mode:        NEAR                               
  Max Queue Number:        128(0x80)                          
  Queue Min Size:          4096(0x1000)                       
  Queue Max Size:          131072(0x20000)                    
  Queue Type:              MULTI                              
  Node:                    1                                  
  Device Type:             GPU                                
  Cache Info:              
    L1:                      16(0x10) KB                        
  Chip ID:                 26751(0x687f)                      
  Cacheline Size:          64(0x40)                           
  Max Clock Freq. (MHz):   1590                               
  BDFID:                   7936                               
  Internal Node ID:        1                                  
  Compute Unit:            56                                 
  SIMDs per CU:            4                                  
  Shader Engines:          4                                  
  Shader Arrs. per Eng.:   1                                  
  WatchPts on Addr. Ranges:4                                  
  Features:                KERNEL_DISPATCH 
  Fast F16 Operation:      FALSE                              
  Wavefront Size:          64(0x40)                           
  Workgroup Max Size:      1024(0x400)                        
  Workgroup Max Size per Dimension:
    x                        1024(0x400)                        
    y                        1024(0x400)                        
    z                        1024(0x400)                        
  Max Waves Per CU:        40(0x28)                           
  Max Work-item Per CU:    2560(0xa00)                        
  Grid Max Size:           4294967295(0xffffffff)             
  Grid Max Size per Dimension:
    x                        4294967295(0xffffffff)             
    y                        4294967295(0xffffffff)             
    z                        4294967295(0xffffffff)             
  Max fbarriers/Workgrp:   32                                 
  Pool Info:               
    Pool 1                   
      Segment:                 GLOBAL; FLAGS: COARSE GRAINED      
      Size:                    8372224(0x7fc000) KB               
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       FALSE                              
    Pool 2                   
      Segment:                 GROUP                              
      Size:                    64(0x40) KB                        
      Allocatable:             FALSE                              
      Alloc Granule:           0KB                                
      Alloc Alignment:         0KB                                
      Accessible by all:       FALSE                              
  ISA Info:                
    ISA 1                    
      Name:                    amdgcn-amd-amdhsa--gfx900          
      Machine Models:          HSA_MACHINE_MODEL_LARGE            
      Profiles:                HSA_PROFILE_BASE                   
      Default Rounding Mode:   NEAR                               
      Default Rounding Mode:   NEAR                               
      Fast f16:                TRUE                               
      Workgroup Max Size:      1024(0x400)                        
      Workgroup Max Size per Dimension:
        x                        1024(0x400)                        
        y                        1024(0x400)                        
        z                        1024(0x400)                        
      Grid Max Size:           4294967295(0xffffffff)             
      Grid Max Size per Dimension:
        x                        4294967295(0xffffffff)             
        y                        4294967295(0xffffffff)             
        z                        4294967295(0xffffffff)             
      FBarrier Max Size:       32                                 
*** Done ***             

tpkessler avatar Aug 01 '20 11:08 tpkessler

[AMD Public Use]

Hi Torsten Thanks for reporting this issue. I can reproduce the issue here. Looking into it a bit more, and adding it to our regularly run tests.

Ron From: Torsten Keßler [email protected] Sent: Saturday, August 1, 2020 6:17 AM To: ROCm-Developer-Tools/aomp [email protected] Cc: Subscribed [email protected] Subject: [ROCm-Developer-Tools/aomp] [aomp] Memory access fault with custom mapper (#122)

[CAUTION: External Email]

I'm using AOMP 11.7-1 STANDALONE compiled from source on Arch Linux. When compiling OpenMP code with a custom mapper I always get a memory access fault.

The following code is taken from the official OpenMP 5.0 examples:

#include <stdlib.h>

#include <stdio.h>

#define N 100

typedef struct myvec{

size_t len;

double *data;

} myvec_t;

#pragma omp declare mapper(myvec_t v) \

map(v, v.data[0:v.len])

void init(myvec_t *s);

int main(){

myvec_t s;

s.data = (double *)calloc(N,sizeof(double));

s.len = N;

#pragma omp target map(s)

init(&s);



printf("s.data[%d]=%lf\n",N-1,s.data[N-1]);

}

void init(myvec_t *s)

{ for(int i=0; ilen; i++) s->data[i]=i; }

$ $AOMP/bin/hipcc -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=$($AOMP/bin/mygpu) target_mapper.1.c -o target_mapper.1

$ ./target_mapper.1

[GPU Memory Error] Addr: 0x1888000 Reason: Page not present or supervisor privilege.

Memory access fault by GPU node-1 (Agent handle: 0x188db50) on address 0x1888000. Reason: Page not present or supervisor privilege.

[1] 12891 abort (core dumped) ./target_mapper.1

My system:

$ $AOMP/bin/rocminfo

ROCk module is loaded

Able to open /dev/kfd read-write

=====================

HSA System Attributes

=====================

Runtime Version: 1.1

System Timestamp Freq.: 1000.000000MHz

Sig. Max Wait Duration: 18446744073709551615 (0xFFFFFFFFFFFFFFFF) (timestamp count)

Machine Model: LARGE

System Endianness: LITTLE

==========

HSA Agents

==========


Agent 1


Name: AMD Ryzen 7 2700X Eight-Core Processor

Uuid: CPU-XX

Marketing Name: AMD Ryzen 7 2700X Eight-Core Processor

Vendor Name: CPU

Feature: None specified

Profile: FULL_PROFILE

Float Round Mode: NEAR

Max Queue Number: 0(0x0)

Queue Min Size: 0(0x0)

Queue Max Size: 0(0x0)

Queue Type: MULTI

Node: 0

Device Type: CPU

Cache Info:

L1:                      32768(0x8000) KB

Chip ID: 0(0x0)

Cacheline Size: 64(0x40)

Max Clock Freq. (MHz): 3700

BDFID: 0

Internal Node ID: 0

Compute Unit: 16

SIMDs per CU: 0

Shader Engines: 0

Shader Arrs. per Eng.: 0

WatchPts on Addr. Ranges:1

Features: None

Pool Info:

Pool 1

  Segment:                 GLOBAL; FLAGS: KERNARG, FINE GRAINED

  Size:                    32896456(0x1f5f5c8) KB

  Allocatable:             TRUE

  Alloc Granule:           4KB

  Alloc Alignment:         4KB

  Accessible by all:       TRUE

Pool 2

  Segment:                 GLOBAL; FLAGS: COARSE GRAINED

  Size:                    32896456(0x1f5f5c8) KB

  Allocatable:             TRUE

  Alloc Granule:           4KB

  Alloc Alignment:         4KB

  Accessible by all:       TRUE

ISA Info:

N/A

Agent 2


Name: gfx900

Uuid: GPU-0213f2a912ee21a4

Marketing Name: Vega 10 XL/XT [Radeon RX Vega 56/64]

Vendor Name: AMD

Feature: KERNEL_DISPATCH

Profile: BASE_PROFILE

Float Round Mode: NEAR

Max Queue Number: 128(0x80)

Queue Min Size: 4096(0x1000)

Queue Max Size: 131072(0x20000)

Queue Type: MULTI

Node: 1

Device Type: GPU

Cache Info:

L1:                      16(0x10) KB

Chip ID: 26751(0x687f)

Cacheline Size: 64(0x40)

Max Clock Freq. (MHz): 1590

BDFID: 7936

Internal Node ID: 1

Compute Unit: 56

SIMDs per CU: 4

Shader Engines: 4

Shader Arrs. per Eng.: 1

WatchPts on Addr. Ranges:4

Features: KERNEL_DISPATCH

Fast F16 Operation: FALSE

Wavefront Size: 64(0x40)

Workgroup Max Size: 1024(0x400)

Workgroup Max Size per Dimension:

x                        1024(0x400)

y                        1024(0x400)

z                        1024(0x400)

Max Waves Per CU: 40(0x28)

Max Work-item Per CU: 2560(0xa00)

Grid Max Size: 4294967295(0xffffffff)

Grid Max Size per Dimension:

x                        4294967295(0xffffffff)

y                        4294967295(0xffffffff)

z                        4294967295(0xffffffff)

Max fbarriers/Workgrp: 32

Pool Info:

Pool 1

  Segment:                 GLOBAL; FLAGS: COARSE GRAINED

  Size:                    8372224(0x7fc000) KB

  Allocatable:             TRUE

  Alloc Granule:           4KB

  Alloc Alignment:         4KB

  Accessible by all:       FALSE

Pool 2

  Segment:                 GROUP

  Size:                    64(0x40) KB

  Allocatable:             FALSE

  Alloc Granule:           0KB

  Alloc Alignment:         0KB

  Accessible by all:       FALSE

ISA Info:

ISA 1

  Name:                    amdgcn-amd-amdhsa--gfx900

  Machine Models:          HSA_MACHINE_MODEL_LARGE

  Profiles:                HSA_PROFILE_BASE

  Default Rounding Mode:   NEAR

  Default Rounding Mode:   NEAR

  Fast f16:                TRUE

  Workgroup Max Size:      1024(0x400)

  Workgroup Max Size per Dimension:

    x                        1024(0x400)

    y                        1024(0x400)

    z                        1024(0x400)

  Grid Max Size:           4294967295(0xffffffff)

  Grid Max Size per Dimension:

    x                        4294967295(0xffffffff)

    y                        4294967295(0xffffffff)

    z                        4294967295(0xffffffff)

  FBarrier Max Size:       32

*** Done ***

You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHubhttps://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2FROCm-Developer-Tools%2Faomp%2Fissues%2F122&data=02%7C01%7Cron.lieberman%40amd.com%7C31243c43bde04724e8cb08d8360c6f2c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637318774316132301&sdata=5o9kAyRjrv38EZz7R92GiMzkXAAD3e51MHrGh88ZDjk%3D&reserved=0, or unsubscribehttps://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2Fnotifications%2Funsubscribe-auth%2FAD3EYZ4FXDCW7TB7Z5LDTZDR6P2TLANCNFSM4PRVBK4A&data=02%7C01%7Cron.lieberman%40amd.com%7C31243c43bde04724e8cb08d8360c6f2c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637318774316132301&sdata=nbBsQKw0SlNT9sIc5EWF%2FxLwhcORxgW3zwjWXcscFBw%3D&reserved=0.

ronlieb avatar Aug 01 '20 11:08 ronlieb

[AMD Public Use]

I modified the example and can get it to run. I think the issues is that the mapper is accessing host memory on device. I changed it to do target allocation prior to mapping.

Cc'ed Kent Milfeld who leads our openmp examples efforts in the community.

Ron

diff --git a/test/omp5/mapper_prob/mapper_prob.c b/test/omp5/mapper_prob/mapper_prob.c index 4a9f8e1..07afc41 100644 --- a/test/omp5/mapper_prob/mapper_prob.c +++ b/test/omp5/mapper_prob/mapper_prob.c @@ -1,5 +1,6 @@ #include <stdlib.h> #include <stdio.h> +#include <omp.h> #define N 100 typedef struct myvec{ size_t len; @@ -12,12 +13,13 @@ void init(myvec_t *s);

int main(){ myvec_t s;

  • s.data = (double *)calloc(N,sizeof(double));
  • s.data = (double *)omp_target_alloc(N * sizeof(double), /*device: */0); s.len = N; #pragma omp target map(s) init(&s);

    printf("s.data[%d]=%lf\n",N-1,s.data[N-1]);

  • omp_target_free(s.data, /*Device: */0); return 0; } void init(myvec_t *s)

From: ronlieb [email protected] Sent: Saturday, August 1, 2020 6:40 AM To: ROCm-Developer-Tools/aomp [email protected] Cc: Lieberman, Ron [email protected]; Your activity [email protected] Subject: Re: [ROCm-Developer-Tools/aomp] [aomp] Memory access fault with custom mapper (#122)

[CAUTION: External Email] [AMD Public Use]

Hi Torsten Thanks for reporting this issue. I can reproduce the issue here. Looking into it a bit more, and adding it to our regularly run tests.

Ron From: Torsten Keßler <[email protected]mailto:[email protected]> Sent: Saturday, August 1, 2020 6:17 AM To: ROCm-Developer-Tools/aomp <[email protected]mailto:[email protected]> Cc: Subscribed <[email protected]mailto:[email protected]> Subject: [ROCm-Developer-Tools/aomp] [aomp] Memory access fault with custom mapper (#122)

[CAUTION: External Email]

I'm using AOMP 11.7-1 STANDALONE compiled from source on Arch Linux. When compiling OpenMP code with a custom mapper I always get a memory access fault.

The following code is taken from the official OpenMP 5.0 examples:

#include <stdlib.h>

#include <stdio.h>

#define N 100

typedef struct myvec{

size_t len;

double *data;

} myvec_t;

#pragma omp declare mapper(myvec_t v) \

map(v, v.data[0:v.len])

void init(myvec_t *s);

int main(){

myvec_t s;

s.data = (double *)calloc(N,sizeof(double));

s.len = N;

#pragma omp target map(s)

init(&s);

printf("s.data[%d]=%lf\n",N-1,s.data[N-1]);

}

void init(myvec_t *s)

{ for(int i=0; ilen; i++) s->data[i]=i; }

$ $AOMP/bin/hipcc -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=$($AOMP/bin/mygpu) target_mapper.1.c -o target_mapper.1

$ ./target_mapper.1

[GPU Memory Error] Addr: 0x1888000 Reason: Page not present or supervisor privilege.

Memory access fault by GPU node-1 (Agent handle: 0x188db50) on address 0x1888000. Reason: Page not present or supervisor privilege.

[1] 12891 abort (core dumped) ./target_mapper.1

My system:

$ $AOMP/bin/rocminfo

ROCk module is loaded

Able to open /dev/kfd read-write

=====================

HSA System Attributes

=====================

Runtime Version: 1.1

System Timestamp Freq.: 1000.000000MHz

Sig. Max Wait Duration: 18446744073709551615 (0xFFFFFFFFFFFFFFFF) (timestamp count)

Machine Model: LARGE

System Endianness: LITTLE

==========

HSA Agents

==========


Agent 1


Name: AMD Ryzen 7 2700X Eight-Core Processor

Uuid: CPU-XX

Marketing Name: AMD Ryzen 7 2700X Eight-Core Processor

Vendor Name: CPU

Feature: None specified

Profile: FULL_PROFILE

Float Round Mode: NEAR

Max Queue Number: 0(0x0)

Queue Min Size: 0(0x0)

Queue Max Size: 0(0x0)

Queue Type: MULTI

Node: 0

Device Type: CPU

Cache Info:

L1: 32768(0x8000) KB

Chip ID: 0(0x0)

Cacheline Size: 64(0x40)

Max Clock Freq. (MHz): 3700

BDFID: 0

Internal Node ID: 0

Compute Unit: 16

SIMDs per CU: 0

Shader Engines: 0

Shader Arrs. per Eng.: 0

WatchPts on Addr. Ranges:1

Features: None

Pool Info:

Pool 1

Segment: GLOBAL; FLAGS: KERNARG, FINE GRAINED

Size: 32896456(0x1f5f5c8) KB

Allocatable: TRUE

Alloc Granule: 4KB

Alloc Alignment: 4KB

Accessible by all: TRUE

Pool 2

Segment: GLOBAL; FLAGS: COARSE GRAINED

Size: 32896456(0x1f5f5c8) KB

Allocatable: TRUE

Alloc Granule: 4KB

Alloc Alignment: 4KB

Accessible by all: TRUE

ISA Info:

N/A


Agent 2


Name: gfx900

Uuid: GPU-0213f2a912ee21a4

Marketing Name: Vega 10 XL/XT [Radeon RX Vega 56/64]

Vendor Name: AMD

Feature: KERNEL_DISPATCH

Profile: BASE_PROFILE

Float Round Mode: NEAR

Max Queue Number: 128(0x80)

Queue Min Size: 4096(0x1000)

Queue Max Size: 131072(0x20000)

Queue Type: MULTI

Node: 1

Device Type: GPU

Cache Info:

L1: 16(0x10) KB

Chip ID: 26751(0x687f)

Cacheline Size: 64(0x40)

Max Clock Freq. (MHz): 1590

BDFID: 7936

Internal Node ID: 1

Compute Unit: 56

SIMDs per CU: 4

Shader Engines: 4

Shader Arrs. per Eng.: 1

WatchPts on Addr. Ranges:4

Features: KERNEL_DISPATCH

Fast F16 Operation: FALSE

Wavefront Size: 64(0x40)

Workgroup Max Size: 1024(0x400)

Workgroup Max Size per Dimension:

x 1024(0x400)

y 1024(0x400)

z 1024(0x400)

Max Waves Per CU: 40(0x28)

Max Work-item Per CU: 2560(0xa00)

Grid Max Size: 4294967295(0xffffffff)

Grid Max Size per Dimension:

x 4294967295(0xffffffff)

y 4294967295(0xffffffff)

z 4294967295(0xffffffff)

Max fbarriers/Workgrp: 32

Pool Info:

Pool 1

Segment: GLOBAL; FLAGS: COARSE GRAINED

Size: 8372224(0x7fc000) KB

Allocatable: TRUE

Alloc Granule: 4KB

Alloc Alignment: 4KB

Accessible by all: FALSE

Pool 2

Segment: GROUP

Size: 64(0x40) KB

Allocatable: FALSE

Alloc Granule: 0KB

Alloc Alignment: 0KB

Accessible by all: FALSE

ISA Info:

ISA 1

Name: amdgcn-amd-amdhsa--gfx900

Machine Models: HSA_MACHINE_MODEL_LARGE

Profiles: HSA_PROFILE_BASE

Default Rounding Mode: NEAR

Default Rounding Mode: NEAR

Fast f16: TRUE

Workgroup Max Size: 1024(0x400)

Workgroup Max Size per Dimension:

x 1024(0x400)

y 1024(0x400)

z 1024(0x400)

Grid Max Size: 4294967295(0xffffffff)

Grid Max Size per Dimension:

x 4294967295(0xffffffff)

y 4294967295(0xffffffff)

z 4294967295(0xffffffff)

FBarrier Max Size: 32

*** Done ***

You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHubhttps://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2FROCm-Developer-Tools%2Faomp%2Fissues%2F122&data=02%7C01%7Cron.lieberman%40amd.com%7C31243c43bde04724e8cb08d8360c6f2c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637318774316132301&sdata=5o9kAyRjrv38EZz7R92GiMzkXAAD3e51MHrGh88ZDjk%3D&reserved=0, or unsubscribehttps://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2Fnotifications%2Funsubscribe-auth%2FAD3EYZ4FXDCW7TB7Z5LDTZDR6P2TLANCNFSM4PRVBK4A&data=02%7C01%7Cron.lieberman%40amd.com%7C31243c43bde04724e8cb08d8360c6f2c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637318774316132301&sdata=nbBsQKw0SlNT9sIc5EWF%2FxLwhcORxgW3zwjWXcscFBw%3D&reserved=0.

You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHubhttps://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2FROCm-Developer-Tools%2Faomp%2Fissues%2F122%23issuecomment-667518304&data=02%7C01%7Cron.lieberman%40amd.com%7C5774eb4695e84d798d5808d8360f96bf%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637318787870803115&sdata=12BR%2B1O3UyWowG691DVyXTAIRDw1xlt98fAeORaDdfc%3D&reserved=0, or unsubscribehttps://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2Fnotifications%2Funsubscribe-auth%2FAD3EYZ2YGZRSAV25YZP5XD3R6P5IBANCNFSM4PRVBK4A&data=02%7C01%7Cron.lieberman%40amd.com%7C5774eb4695e84d798d5808d8360f96bf%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637318787870813116&sdata=pZkVpKvrfRLgF28eap5d7pRYQrINPeGCbv%2FKH1GWEEQ%3D&reserved=0.

ronlieb avatar Aug 01 '20 11:08 ronlieb

Hey Ron, thank you very much for your quick reply. Applying your changes I get a segmentation fault because printf() tries to access memory on allocated the device. The code works if I add a

#pragma omp target

before printf().

When I replace the custom mapper by explicitly mapping myvec_t,

#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#define N 100
typedef struct myvec{
    size_t len;
    double *data;
} myvec_t;

void init(myvec_t *s);

int main(){
    myvec_t s;
    s.data = (double *)calloc(N,sizeof(double));
    s.len = N;

    #pragma omp target map(tofrom: s, s.data[:s.len])
    init(&s);
    
    printf("s.data[%d]=%lf\n",N-1,s.data[N-1]);
}

void init(myvec_t *s)
{ for(int i=0; i<s->len; i++) s->data[i]=i; }

I get a different error:

[/build/aomp-amdgpu/src/amd-llvm-project/openmp/libomptarget/plugins/hsa/impl/data.cpp:200] Copy async between memory pools failed: HSA_STATUS_ERROR_INVALID_ARGUMENT

Offloading a bare array works, but not if it's combined in a struct:

#include <stdio.h>

#define N 10lu

typedef struct{
    size_t len;
    double *data;
} myvec_t;

int main()
{
    double x[N];

    /* This code works */
    #pragma omp target map(x[:N])
    for(size_t i=0; i < N; i++){
        x[i] = 1.0 * i;
    }
    printf("x[%zu] = %g\n", N-1, x[N-1]);

    double y[N];
    myvec_t s = {.len = N, .data = &y[0]};

    /* This doesn't work */
    #pragma omp target map(s, s.data[:s.len])
    for(size_t i=0; i < s.len; i++){
        s.data[i] = 2.0 * i;
    }
    printf("s.data[%zu] = %g\n", N-1, s.data[N-1]);

    return 0;
}

Output (same as above):

x[9] = 9
[/build/aomp-amdgpu/src/amd-llvm-project/openmp/libomptarget/plugins/hsa/impl/data.cpp:200] Copy async between memory pools failed: HSA_STATUS_ERROR_INVALID_ARGUMENT

Torsten

tpkessler avatar Aug 02 '20 12:08 tpkessler

Hi Ron,

Is this example being used on a unified shared memory (USM) machine.

If not USM

The idea of directly allocating on the device and storing the device pointer as a member function is a reasonable thing to do (now that I've seen you try this). But there are some other things that need to be done to achieve this correctly. For instance, s.data must be specified (mapped) as a device pointer.

The mapper should map both the myvec_t structure (v) and the storage behind v.data (that is allocated
on the host), as described in the declare mapper directive.  I suspect it might be a bug, and there is no
attachment occurring on the device to the "data" storage and/or the "data" storage is not being created
and copied from the original variable (data on the host).

If USM:

This is always tricky for me, because (I think) map clauses are ignored, for the most part.
However, I'm not sure what the Spec says about mappers being used with USM.

We should talk to Deepak tomorrow about this.

Best, Kent


From: Lieberman, Ron [email protected] Sent: Saturday, August 1, 2020 6:57 AM To: ROCm-Developer-Tools/aomp [email protected]; ROCm-Developer-Tools/aomp [email protected] Cc: Your activity [email protected]; Kent Milfeld [email protected] Subject: RE: [ROCm-Developer-Tools/aomp] [aomp] Memory access fault with custom mapper (#122)

[AMD Public Use]

I modified the example and can get it to run.

I think the issues is that the mapper is accessing host memory on device.

I changed it to do target allocation prior to mapping.

Cc’ed Kent Milfeld who leads our openmp examples efforts in the community.

Ron

diff --git a/test/omp5/mapper_prob/mapper_prob.c b/test/omp5/mapper_prob/mapper_prob.c

index 4a9f8e1..07afc41 100644

--- a/test/omp5/mapper_prob/mapper_prob.c

+++ b/test/omp5/mapper_prob/mapper_prob.c

@@ -1,5 +1,6 @@

#include <stdlib.h>

#include <stdio.h>

+#include <omp.h>

#define N 100

typedef struct myvec{

 size_t len;

@@ -12,12 +13,13 @@ void init(myvec_t *s);

int main(){

 myvec_t s;
  • s.data = (double *)calloc(N,sizeof(double));
  • s.data = (double *)omp_target_alloc(N * sizeof(double), /*device: */0);

    s.len = N;

    #pragma omp target map(s)

    init(&s);

    printf("s.data[%d]=%lf\n",N-1,s.data[N-1]);

  • omp_target_free(s.data, /*Device: */0);

    return 0;

}

void init(myvec_t *s)

From: ronlieb [email protected] Sent: Saturday, August 1, 2020 6:40 AM To: ROCm-Developer-Tools/aomp [email protected] Cc: Lieberman, Ron [email protected]; Your activity [email protected] Subject: Re: [ROCm-Developer-Tools/aomp] [aomp] Memory access fault with custom mapper (#122)

[CAUTION: External Email]

[AMD Public Use]

Hi Torsten Thanks for reporting this issue. I can reproduce the issue here. Looking into it a bit more, and adding it to our regularly run tests.

Ron From: Torsten Keßler <[email protected]mailto:[email protected]> Sent: Saturday, August 1, 2020 6:17 AM To: ROCm-Developer-Tools/aomp <[email protected]mailto:[email protected]> Cc: Subscribed <[email protected]mailto:[email protected]> Subject: [ROCm-Developer-Tools/aomp] [aomp] Memory access fault with custom mapper (#122)

[CAUTION: External Email]

I'm using AOMP 11.7-1 STANDALONE compiled from source on Arch Linux. When compiling OpenMP code with a custom mapper I always get a memory access fault.

The following code is taken from the official OpenMP 5.0 examples:

#include <stdlib.h>

#include <stdio.h>

#define N 100

typedef struct myvec{

size_t len;

double *data;

} myvec_t;

#pragma omp declare mapper(myvec_t v) \

map(v, v.data[0:v.len])

void init(myvec_t *s);

int main(){

myvec_t s;

s.data = (double *)calloc(N,sizeof(double));

s.len = N;

#pragma omp target map(s)

init(&s);

printf("s.data[%d]=%lf\n",N-1,s.data[N-1]);

}

void init(myvec_t *s)

{ for(int i=0; ilen; i++) s->data[i]=i; }

$ $AOMP/bin/hipcc -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=$($AOMP/bin/mygpu) target_mapper.1.c -o target_mapper.1

$ ./target_mapper.1

[GPU Memory Error] Addr: 0x1888000 Reason: Page not present or supervisor privilege.

Memory access fault by GPU node-1 (Agent handle: 0x188db50) on address 0x1888000. Reason: Page not present or supervisor privilege.

[1] 12891 abort (core dumped) ./target_mapper.1

My system:

$ $AOMP/bin/rocminfo

ROCk module is loaded

Able to open /dev/kfd read-write

=====================

HSA System Attributes

=====================

Runtime Version: 1.1

System Timestamp Freq.: 1000.000000MHz

Sig. Max Wait Duration: 18446744073709551615 (0xFFFFFFFFFFFFFFFF) (timestamp count)

Machine Model: LARGE

System Endianness: LITTLE

==========

HSA Agents

==========


Agent 1


Name: AMD Ryzen 7 2700X Eight-Core Processor

Uuid: CPU-XX

Marketing Name: AMD Ryzen 7 2700X Eight-Core Processor

Vendor Name: CPU

Feature: None specified

Profile: FULL_PROFILE

Float Round Mode: NEAR

Max Queue Number: 0(0x0)

Queue Min Size: 0(0x0)

Queue Max Size: 0(0x0)

Queue Type: MULTI

Node: 0

Device Type: CPU

Cache Info:

L1: 32768(0x8000) KB

Chip ID: 0(0x0)

Cacheline Size: 64(0x40)

Max Clock Freq. (MHz): 3700

BDFID: 0

Internal Node ID: 0

Compute Unit: 16

SIMDs per CU: 0

Shader Engines: 0

Shader Arrs. per Eng.: 0

WatchPts on Addr. Ranges:1

Features: None

Pool Info:

Pool 1

Segment: GLOBAL; FLAGS: KERNARG, FINE GRAINED

Size: 32896456(0x1f5f5c8) KB

Allocatable: TRUE

Alloc Granule: 4KB

Alloc Alignment: 4KB

Accessible by all: TRUE

Pool 2

Segment: GLOBAL; FLAGS: COARSE GRAINED

Size: 32896456(0x1f5f5c8) KB

Allocatable: TRUE

Alloc Granule: 4KB

Alloc Alignment: 4KB

Accessible by all: TRUE

ISA Info:

N/A


Agent 2


Name: gfx900

Uuid: GPU-0213f2a912ee21a4

Marketing Name: Vega 10 XL/XT [Radeon RX Vega 56/64]

Vendor Name: AMD

Feature: KERNEL_DISPATCH

Profile: BASE_PROFILE

Float Round Mode: NEAR

Max Queue Number: 128(0x80)

Queue Min Size: 4096(0x1000)

Queue Max Size: 131072(0x20000)

Queue Type: MULTI

Node: 1

Device Type: GPU

Cache Info:

L1: 16(0x10) KB

Chip ID: 26751(0x687f)

Cacheline Size: 64(0x40)

Max Clock Freq. (MHz): 1590

BDFID: 7936

Internal Node ID: 1

Compute Unit: 56

SIMDs per CU: 4

Shader Engines: 4

Shader Arrs. per Eng.: 1

WatchPts on Addr. Ranges:4

Features: KERNEL_DISPATCH

Fast F16 Operation: FALSE

Wavefront Size: 64(0x40)

Workgroup Max Size: 1024(0x400)

Workgroup Max Size per Dimension:

x 1024(0x400)

y 1024(0x400)

z 1024(0x400)

Max Waves Per CU: 40(0x28)

Max Work-item Per CU: 2560(0xa00)

Grid Max Size: 4294967295(0xffffffff)

Grid Max Size per Dimension:

x 4294967295(0xffffffff)

y 4294967295(0xffffffff)

z 4294967295(0xffffffff)

Max fbarriers/Workgrp: 32

Pool Info:

Pool 1

Segment: GLOBAL; FLAGS: COARSE GRAINED

Size: 8372224(0x7fc000) KB

Allocatable: TRUE

Alloc Granule: 4KB

Alloc Alignment: 4KB

Accessible by all: FALSE

Pool 2

Segment: GROUP

Size: 64(0x40) KB

Allocatable: FALSE

Alloc Granule: 0KB

Alloc Alignment: 0KB

Accessible by all: FALSE

ISA Info:

ISA 1

Name: amdgcn-amd-amdhsa--gfx900

Machine Models: HSA_MACHINE_MODEL_LARGE

Profiles: HSA_PROFILE_BASE

Default Rounding Mode: NEAR

Default Rounding Mode: NEAR

Fast f16: TRUE

Workgroup Max Size: 1024(0x400)

Workgroup Max Size per Dimension:

x 1024(0x400)

y 1024(0x400)

z 1024(0x400)

Grid Max Size: 4294967295(0xffffffff)

Grid Max Size per Dimension:

x 4294967295(0xffffffff)

y 4294967295(0xffffffff)

z 4294967295(0xffffffff)

FBarrier Max Size: 32

*** Done ***

You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHubhttps://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2FROCm-Developer-Tools%2Faomp%2Fissues%2F122&data=02%7C01%7Cron.lieberman%40amd.com%7C31243c43bde04724e8cb08d8360c6f2c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637318774316132301&sdata=5o9kAyRjrv38EZz7R92GiMzkXAAD3e51MHrGh88ZDjk%3D&reserved=0, or unsubscribehttps://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2Fnotifications%2Funsubscribe-auth%2FAD3EYZ4FXDCW7TB7Z5LDTZDR6P2TLANCNFSM4PRVBK4A&data=02%7C01%7Cron.lieberman%40amd.com%7C31243c43bde04724e8cb08d8360c6f2c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637318774316132301&sdata=nbBsQKw0SlNT9sIc5EWF%2FxLwhcORxgW3zwjWXcscFBw%3D&reserved=0.

— You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHubhttps://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2FROCm-Developer-Tools%2Faomp%2Fissues%2F122%23issuecomment-667518304&data=02%7C01%7Cron.lieberman%40amd.com%7C5774eb4695e84d798d5808d8360f96bf%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637318787870803115&sdata=12BR%2B1O3UyWowG691DVyXTAIRDw1xlt98fAeORaDdfc%3D&reserved=0, or unsubscribehttps://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2Fnotifications%2Funsubscribe-auth%2FAD3EYZ2YGZRSAV25YZP5XD3R6P5IBANCNFSM4PRVBK4A&data=02%7C01%7Cron.lieberman%40amd.com%7C5774eb4695e84d798d5808d8360f96bf%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637318787870813116&sdata=pZkVpKvrfRLgF28eap5d7pRYQrINPeGCbv%2FKH1GWEEQ%3D&reserved=0.

ronlieb avatar Aug 03 '20 14:08 ronlieb

Update: With the new 11.8-0 release I get the following error

x[9] = 9
[/build/aomp-amdgpu/src/amd-llvm-project/openmp/libomptarget/plugins/hsa/impl/data.cpp:99] atmi_malloc failed: HSA_STATUS_ERROR_INVALID_ARGUMENT

tpkessler avatar Aug 22 '20 09:08 tpkessler

Update for 11.9-0:

x[9] = 9
[/build/aomp-amdgpu/src/amd-llvm-project/openmp/libomptarget/plugins/hsa/impl/data.cpp:100] atmi_malloc failed: HSA_STATUS_ERROR_INVALID_ARGUMENT

tpkessler avatar Sep 18 '20 12:09 tpkessler

Update for 11.11-0:

x[9] = 9
[/build/aomp-amdgpu/src/amd-llvm-project/openmp/libomptarget/plugins/amdgpu/impl/data.cpp:61] atmi_malloc failed: HSA_STATUS_ERROR_INVALID_ARGUMENT

tpkessler avatar Nov 01 '20 12:11 tpkessler

@estewart08 Do we have this in a smoke test? @tpkessler Torsten, can you check this with AOMP 13.0-2 with ROCm 4.1 dkms kernel.
After modinfo should show you are on 5.9.15. Run this command. modinfo -F version amdgpu

gregrodgers avatar Apr 20 '21 12:04 gregrodgers

we have a test case in aomp/test/omp5/mapper_prob this compiles and runs with amd-stg-open , aomp 11.8-0 11.9-2 11.11-* but not aomp_13.0-2

ronlieb avatar May 09 '21 21:05 ronlieb

@gregrodgers: I've tested the following examples with clangfrom the latest ROCm docker image and with aomp-13.0-2 compiled from source with kernel 5.12.5 (Arch Linux) on a Vega 56.

@ronlieb : Your test directly allocates memory on the device. The crucial part of the map directive (and the custom mapper) is that OpenMP takes care of that. Otherwise only the pointer address is mapped, see my third example.

I used the following flags:

$AOMP/bin/clang -g -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=$($AOMP/bin/mygpu)

The first example is the official OpenMP example I already used in my first post.

target_mapper.1.c
#include <stdlib.h>
#include <stdio.h>
#define N 100
typedef struct myvec{
    size_t len;
    double *data;
} myvec_t;

#pragma omp declare mapper(myvec_t v) \
    map(v, v.data[0:v.len])
void init(myvec_t *s);

int main(){
    myvec_t s;
    s.data = (double *)calloc(N,sizeof(double));
    s.len = N;
    #pragma omp target map(s)
    init(&s);
    
    printf("s.data[%d]=%lf\n",N-1,s.data[N-1]);
}

void init(myvec_t *s)
{ for(int i=0; i<s->len; i++) s->data[i]=i; }

output

Libomptarget message: explicit extension not allowed: host address specified is 0x00007ffe08309e00 (16 bytes), but device allocation maps to host at 0x00007ffe08309e08 (8 bytes)
Libomptarget error: Call to getOrAllocTgtPtr returned null pointer (device failure or illegal mapping).
Libomptarget error: Call to targetDataBegin via targetDataMapper for custom mapper failed.
Libomptarget error: Call to targetDataBegin failed, abort target.
Libomptarget error: Failed to process data before launching the kernel.
Libomptarget error: Run with LIBOMPTARGET_DEBUG=4 to dump host-target pointer mappings.
target_mapper.1.c:23:5: Libomptarget fatal error 1: failure of target construct while offloading is mandatory

If I manually map the struct I get a different error message (probably due to s.data[:s.len])

target_mapper.2.c
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#define N 100
typedef struct myvec{
    size_t len;
    double *data;
} myvec_t;

void init(myvec_t *s);

int main(){
    myvec_t s;
    s.data = (double *)calloc(N,sizeof(double));
    s.len = N;
    #pragma omp target map(s, s.data[:s.len])
    init(&s);
    
    printf("s.data[%d]=%lf\n",N-1,s.data[N-1]);
}

void init(myvec_t *s)
{ for(int i=0; i<s->len; i++) s->data[i]=i; }

output

/home/torsten/Downloads/aomp13/llvm-project/openmp/libomptarget/plugins/amdgpu/impl/data.cpp:60] atmi_malloc failed: HSA_STATUS_ERROR_INVALID_ARGUMENT

Note that mapping the whole array is crucial. Otherwise, only the value of the pointer (that is its address in host RAM) is mapped.

target_mapper.3.c
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#define N 100
typedef struct myvec{
    size_t len;
    double *data;
} myvec_t;

int main(){
    myvec_t s;
    s.data = (double *)calloc(N,sizeof(double));
    if(!s.data){
        fprintf(stderr, "alloc failed\n");
        exit(1);
    }
    s.len = N;
    printf("CPU: Array at %p with length %zu\n", s.data, s.len);
    #pragma omp target map(s)
    printf("GPU: Array at %p with length %zu\n", s.data, s.len);
}

output

CPU: Array at 0x1180340 with length 100
GPU: Array at 0x1180340 with length 100

tpkessler avatar May 22 '21 16:05 tpkessler

I believe this is now fixed. @tpkessler Can you please check again and let me know if you see any problems?

carlobertolli avatar Feb 14 '22 14:02 carlobertolli

Yes @carlobertolli, my sample programs work with the latest release. Thank you!

tpkessler avatar Mar 01 '22 14:03 tpkessler

Closing this as the sample program now works.

jplehr avatar Jan 11 '23 10:01 jplehr